本文整理汇总了Python中pyspark.sql.SQLContext.jsonFile方法的典型用法代码示例。如果您正苦于以下问题：Python SQLContext.jsonFile方法的具体用法？Python SQLContext.jsonFile怎么用？Python SQLContext.jsonFile使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext的用法示例。
在下文中一共展示了SQLContext.jsonFile方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_recommendations
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
class RecommendationEngine:
    """A travel recommendation engine backed by Spark SQL.

    On construction two JSON datasets are loaded and registered as the
    temporary tables ``contacts`` and ``offres``. Recommendations are the
    rows of a join between those two tables.
    """

    def __init__(self, sc):
        """Init the recommendation engine given a Spark context.

        The dataset locations are hard-coded relative paths, so they are
        resolved against whatever default filesystem / working directory
        Spark is configured with.

        Parameters
        ----------
        sc : SparkContext
            Context used to create the SQLContext that holds both tables.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        self.sqlContext = SQLContext(sc)

        path_contacts = "data_v3/contacts/attempt_contactV3_perfect_match.json"
        df_contacts = self.sqlContext.jsonFile(path_contacts)
        df_contacts.registerTempTable("contacts")

        path_offres = "data_v3/offres/attempt_productV3_perfect_match.json"
        df_offres = self.sqlContext.jsonFile(path_offres)
        df_offres.registerTempTable("offres")

    def get_recommendations(self, user_id):
        """Recommends travel for user.

        NOTE(review): ``user_id`` is accepted but not used by the query, so
        the result covers every contact, not only the requested one.

        Returns
        -------
        list
            The collected ``(contact_id, prod_id)`` rows for every
            contact/offer pair whose continent, "envie" and "moyen" columns
            all match.
        """
        # The previous version also built a small RDD from a literal tuple
        # and never used it; that dead code has been removed.
        reco = self.sqlContext.sql(
            "SELECT c.contact_id, o.prod_id "
            "FROM contacts c , offres o "
            "WHERE o.continent_offre = c.continent "
            "and o.envie_offre = c.envie "
            "and o.moyen_offre = c.moyen").collect()
        return reco
示例2: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def main():
    """Convert a JSON file or directory to Parquet with PySpark.

    Reads ``-j/--json`` (input) and ``-p/--parquetDir`` (output) from the
    command line, loads the JSON through Spark SQL and writes it back out as
    Parquet. The DataFrame reader/writer API is used when
    ``isMinVersion(spark_version, 1.4)`` holds; otherwise the legacy
    ``jsonFile`` / ``saveAsParquetFile`` calls are used.
    """
    log = logging.getLogger(prog)
    log.setLevel(logging.INFO)
    # bit hackish and hard to keep aligned with docstring changes, not using this
    # usage = '\r\b\r\b\r' + __doc__ + "usage: %prog -j file.json -p directory.parquet"
    # parser = OptionParser(usage=usage, version='%prog ' + __version__)
    parser = OptionParser(version='%prog ' + __version__)
    parser.add_option('-j', '--json', dest='jsonFile', help='JSON input file/dir', metavar='<file/dir>')
    parser.add_option('-p', '--parquetDir', dest='parquetDir', help='Parquet output dir', metavar='<dir>')
    (options, args) = parser.parse_args()
    jsonFile = options.jsonFile
    parquetDir = options.parquetDir
    # Both options are mandatory and stray positional arguments are rejected.
    if args or not jsonFile or not parquetDir:
        # NOTE(review): usage() is defined outside this block; presumably it
        # prints help and exits - confirm.
        usage(parser)

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s', spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))

    if isMinVersion(spark_version, 1.4):
        # Named ``df`` rather than ``json`` to avoid shadowing the stdlib module name.
        df = sqlContext.read.json(jsonFile)
        df.write.parquet(parquetDir)
    else:
        # Logger.warn is a deprecated alias; use Logger.warning.
        log.warning('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(jsonFile)
        df.saveAsParquetFile(parquetDir)
示例3: run
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def run(self):
    """Convert the JSON input named in the options to a Parquet directory.

    Reads ``self.options.jsonFile`` and ``self.options.parquetDir``; calls
    ``self.usage`` when either is missing or when positional arguments were
    given. The DataFrame reader/writer API is used when
    ``isMinVersion(spark_version, 1.4)`` holds, the legacy
    ``jsonFile`` / ``saveAsParquetFile`` calls otherwise.
    """
    jsonFile = self.options.jsonFile
    parquetDir = self.options.parquetDir
    # NOTE(review): self.usage() is defined outside this block; presumably
    # it prints help and exits - confirm.
    if not jsonFile:
        self.usage('--json not defined')
    if not parquetDir:
        self.usage('--parquetDir not defined')
    if self.args:
        self.usage()

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s', spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))

    if isMinVersion(spark_version, 1.4):
        # Named ``df`` rather than ``json`` to avoid shadowing the stdlib module name.
        df = sqlContext.read.json(jsonFile)
        df.write.parquet(parquetDir)
    else:
        # Logger.warn is a deprecated alias; use Logger.warning.
        log.warning('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(jsonFile)
        df.saveAsParquetFile(parquetDir)
示例4: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def main(self, sc, *args):
    """Filter report entries for the configured tests and write them as JSON lines.

    Builds a Spark SQL schema from ``header_avro["fields"]`` plus
    ``self.extra_fields``, loads ``self.input().path`` as JSON with that
    schema, keeps rows whose ``test_name`` is one of ``self.test_names`` and
    whose ``record_type`` is ``'entry'``, passes them through
    ``self.find_interesting`` and writes each resulting row as one JSON
    document per line to ``self.output()``.

    Parameters
    ----------
    sc : SparkContext
        Context used to create the SQLContext.
    *args
        Accepted but unused.
    """
    from pyspark.sql.types import BooleanType, StringType
    from pyspark.sql.types import FloatType, StructField, StructType
    from pyspark.sql import SQLContext

    fields = []
    for field in header_avro["fields"] + self.extra_fields:
        # Only "float" and "bool" get a dedicated type; everything else is
        # read as a string.
        if field["type"] == "float":
            field_type = FloatType()
        elif field["type"] == "bool":
            field_type = BooleanType()
        else:
            field_type = StringType()
        fields.append(StructField(field["name"], field_type))
    schema = StructType(fields)

    sqlContext = SQLContext(sc)
    logger.info("Reading %s from %s" % (self.test_name, self.input().path))
    df = sqlContext.jsonFile(self.input().path, schema)
    df.registerTempTable("reports")

    # NOTE(review): test names are interpolated into the filter expression
    # unescaped; fine for trusted configuration, unsafe for external input.
    test_name_clause = ' OR '.join(
        "test_name = '{test_name}'".format(test_name=tn)
        for tn in self.test_names)
    entries = df.filter(
        "({test_names}) AND record_type = 'entry'".format(
            test_names=test_name_clause))
    interestings = self.find_interesting(entries)

    # The context manager closes the target even if collect() or write()
    # raises; previously the handle leaked on error.
    # NOTE(review): assumes the object returned by open('w') supports the
    # context-manager protocol - confirm for the target type in use.
    with self.output().open('w') as out_file:
        for interesting in interestings.toJSON().collect():
            out_file.write(interesting)
            out_file.write("\n")
示例5: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def main(sc):
    """Count events per (date, hour, postal code) and save the result as text.

    Loads the JSON dataset at ``events``, flattens the nested
    ``events.event`` arrays into one row per event, registers the rows as
    the ``events`` table, derives ``event_date`` and ``hour`` through two
    registered SQL functions, groups and counts, then writes the rows
    (converted by ``toCSV``) to ``events_cluster``.

    Parameters
    ----------
    sc : SparkContext
        Context used to create the SQLContext.
    """
    path = "events"
    sqlContext = SQLContext(sc)

    events = sqlContext.jsonFile(path)
    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)
    events_df.registerTempTable("events")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

    # The query text used to be one string literal broken across two
    # physical lines, which is a syntax error; adjacent literals are used.
    events_filtered = sqlContext.sql(
        "select title, str_date(start_time) as event_date, "
        "to_hour(start_time) as hour, postal_code from events "
        "where postal_code is not null and start_time is not null")
    # The grouped query below reads from "events_filtered", which was never
    # registered before, so it could not resolve the table.
    events_filtered.registerTempTable("events_filtered")

    events_grouped = sqlContext.sql(
        "select event_date, hour, postal_code, "
        "count(*) from events_filtered "
        "group by event_date,hour,postal_code order by postal_code,hour")
    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
示例6: selectJson
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def selectJson(sc, columns, filePath):
    """Print rows of a JSON dataset to stdout.

    Parameters
    ----------
    sc : SparkContext
        Context used to create the SQLContext.
    columns : list
        Column names to display; a first element of ``'*'`` means every
        column.
    filePath : str
        Location of the JSON data.
    """
    sqlContext = SQLContext(sc)
    wants_every_column = columns[0] == '*'
    if not wants_every_column:
        # Projection requested: load through the generic data-source API.
        frame = sqlContext.load(filePath, "json")
        frame.select(columns).show()
        return
    # No projection: show the whole DataFrame on stdout.
    frame = sqlContext.jsonFile(filePath)
    frame.show()
示例7: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def main(sc):
    """Extract English tweets, keep those passing ``healthFilter``, write to ``output/``.

    Side effect: sets the module-level ``filter_terms_set_bc`` to a
    broadcast of the filter-term set.
    NOTE(review): ``healthFilter`` presumably reads that global - confirm.
    """
    global filter_terms_set_bc

    sqlContext = SQLContext(sc)
    tweets_df = sqlContext.jsonFile(DATA_PATH)

    # Ship the filter-term file, then load its lines.
    sc.addFile(FILTER_TERMS_FILE_PATH)
    terms_path = SparkFiles.get("freebase-symptoms-just-terms.txt")
    terms_rdd = sc.textFile(terms_path)
    filter_terms_set_bc = sc.broadcast(Set(terms_rdd.collect()))

    # Expose the DataFrame to SQL under the table name "tweet".
    tweets_df.registerTempTable("tweet")
    english_tweets = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")

    # Keep only rows accepted by healthFilter and persist them.
    health_rdd = english_tweets.rdd.filter(healthFilter)
    health_rdd.mapPartitions(writeRecords).saveAsTextFile("output/")
示例8: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def main():
    """Exercise pymongo and pymongo_spark against a MongoDB instance.

    Prints every document of ``test.tabla1`` via pymongo, copies
    ``test.tabla1`` to ``test.tabla2`` via Spark, then loads a sample JSON
    file and stores it in ``test.tabla3``. The MongoDB URL prefix is read
    from ``configuration.cfg`` (section ``BatchProperties``, key
    ``URLMongoDB``).
    """
    spark_conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=spark_conf)
    sqlContext = SQLContext(sc)

    settings = ConfigParser.ConfigParser()
    settings.read('configuration.cfg')
    mongodb_connection = settings.get('BatchProperties', 'URLMongoDB')

    #######################################################
    # USING THE PYMONGO LIBRARY
    #######################################################
    mongo = MongoClient()
    for document in mongo.test.tabla1.find():
        print(document)

    #######################################################
    # USING THE pymongo_spark LIBRARY
    #######################################################
    # Read a MongoDB collection (db: test; collection: tabla1)
    source_rdd = sc.mongoRDD(mongodb_connection + 'test.tabla1')
    # Save the RDD just read back to MongoDB (db: test; collection: tabla2)
    source_rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Work out the project root directory
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    # e.g. BASE_DIR = /Users/akash/PycharmProjects/masterbigdata

    # Read a sample file
    sample_path = os.path.join(BASE_DIR + '/datasets/batch/air', 'ficheroSalidaAire.txt')
    sample_frame = sqlContext.jsonFile(sample_path)
    # Store the file's contents in MongoDB
    # NOTE(review): saveToMongoDB is called on the object returned by
    # jsonFile; confirm pymongo_spark provides it for that type.
    sample_frame.saveToMongoDB(mongodb_connection + 'test.tabla3')
示例9: run
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def run(self):
    """Convert the JSON source named by the ``json`` option to Parquet.

    Reads the ``json`` and ``parquet_dir`` options via ``self.get_opt``.
    The DataFrame reader/writer API is used when
    ``isMinVersion(spark_version, 1.4)`` holds, the legacy
    ``jsonFile`` / ``saveAsParquetFile`` calls otherwise.
    """
    json_file = self.get_opt('json')
    parquet_dir = self.get_opt('parquet_dir')
    # let Spark fail if the json source / parquet destination aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystem or any other uri scheme Spark supports
    log.info("Json Source: %s", json_file)
    log.info("Parquet Destination: %s", parquet_dir)

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)  # pylint: disable=invalid-name
    sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
    spark_version = sc.version
    log.info('Spark version detected as %s', spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))

    if isMinVersion(spark_version, 1.4):
        df = sqlContext.read.json(json_file)  # pylint: disable=invalid-name
        df.write.parquet(parquet_dir)
    else:
        # Logger.warn is a deprecated alias; use Logger.warning.
        log.warning('running legacy code for Spark <= 1.3')
        df = sqlContext.jsonFile(json_file)  # pylint: disable=invalid-name
        df.saveAsParquetFile(parquet_dir)
示例10: get_language_correlation
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
def get_language_correlation():
    """
    calculates the correlation between github languages

    Loads the push records at ``git_14_15/git_results``, keeps those whose
    ``repository_language`` is in ``LANG``, builds one vector of push counts
    per actor, computes the correlation matrix with ``Statistics.corr``,
    prints it and passes it to ``plot_graph``. The Spark context is always
    stopped, even when a step fails.
    """
    # Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")
    try:
        # Create SQL Context
        sqlCtx = SQLContext(sc)
        # Create a schemaRDD from json datasets stored in HDFS
        pushes = sqlCtx.jsonFile('git_14_15/git_results')
        # Register the schemaRDD as a Table
        pushes.registerTempTable('pushes')
        # filter the data to get the pushes for the languages from LANG
        # NOTE(review): str(tuple(LANG)) renders a one-element LANG as
        # "('x',)", which is not a valid SQL IN list - confirm LANG always
        # has at least two entries.
        filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))
        # map to (actor, {lang: pushes})
        f_pair = filtered.map(lambda s: (s.actor, {s.repository_language: s.pushes}))
        # group by actor: (actor, [{lang1: pushes}, {lang2: pushes}, ...])
        f_group = f_pair.groupByKey()
        # merge the language dictionaries into a single ordered dict per actor
        f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))
        # build the vectors the correlation algorithm needs; an explicit
        # list is passed because map() returns an iterator on Python 3
        vectors = f_merged.map(lambda s: Vectors.dense([float(v) for v in s.values()]))
        # call the correlation function
        matrix = Statistics.corr(vectors)
        # single-argument print() behaves identically on Python 2 and 3
        print(matrix)
        plot_graph(matrix)
    finally:
        # Previously skipped whenever any step above raised.
        sc.stop()
示例11: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
# Public DNS name of the master node; used below to build the HDFS URL.
# A stray leading single quote inside the literal was removed: it produced
# "hdfs://'ec2-...", which is not a valid host.
master_public_dns = "ec2-52-34-253-146.us-west-2.compute.amazonaws.com"
worker_public_dns = ['52.34.253.146', '52.27.28.14', '52.35.88.14', '52.32.240.173', '52.88.31.138']

conf = SparkConf().setAppName("hawkeye")
sc = SparkContext(conf=conf)
##sc = SparkContext("spark://" + master_ip + ":7077", "hawkeye")

from cassandra.cluster import Cluster

# NOTE(review): both branches build the same cluster, so USE_OLD_CLUSTER
# currently has no effect - confirm whether a second host list is missing.
if USE_OLD_CLUSTER == 1:
    cluster = Cluster(worker_public_dns)
else:
    cluster = Cluster(worker_public_dns)
session = cluster.connect(CASSANDRA_KEYSPACE)

sqlsc = SQLContext(sc)
# Load every file six directory levels below the topic directory as JSON.
hemsgs = sqlsc.jsonFile("hdfs://" + master_public_dns + ":9000/camus/topics/" + KAFKA_TOPIC + "/*/*/*/*/*/*")
#hemsgs.count(); hemsgs.take(1)
# Row({
# "tsIn": 1454556923889,
# "tsOut": 1454556983787,
# "packetID": "PACKET19083",
# "monitorGroup": [
# {"type": "I", "subgroup": "TASKID", "id": "TASKID492", "power": "1"},
# {"type": "T", "subgroup": "TASKTYPE", "id": "TASKTYPE69", "power": "2"},
# {"type": "I", "subgroup": "SWID", "id": "SWID6", "power": "3"},
# {"type": "T", "subgroup": "SWTYPE", "id": "mysql", "power": "4"},
# {"type": "I", "subgroup": "APPID", "id": "hawkeye", "power": "5"},
# {"type": "T", "subgroup": "APPTYPE", "id": "APP", "power": "6"}
# ]
# })
示例12: eval
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
# Load the trained classifier. Pickle data is binary, so the file is opened
# in 'rb' mode, and the context manager closes the handle (it used to leak).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open('data/classifier.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model_b = sc.broadcast(model)

# Score the first record's review text with the broadcast model.
# NOTE(review): eval() on each input record executes whatever the record
# contains; if the records are JSON or Python literals, json.loads /
# ast.literal_eval would be the safe replacement - confirm the data format.
fashion.map(lambda x: eval(x)['reviewText']).map(lambda x: (x, model_b.value.predict([x])[0])).first()

################################### Spark DataFrame API and Spark SQL ###################################
# Part 5 : Loading data to spark

# We start by loading the files to spark
# First, load them as text file to validate
review_filepaths = 'Data/Reviews/*'
textRDD = sc.textFile(review_filepaths)
# single-argument print() behaves identically on Python 2 and 3
print('number of reviews : {0}'.format(textRDD.count()))
print('sample row : \n{0}'.format(textRDD.first()))

# You can let spark infer the schema of your DataFrame
inferredDF = sqc.jsonFile(review_filepaths)
inferredDF.first()

# Or you can programmatically tell spark how the schema looks like
# Define Schema
REVIEWS_SCHEMA_DEF = StructType([
    StructField('reviewerID', StringType(), True),
    StructField('asin', StringType(), True),
    StructField('reviewerName', StringType(), True),
    StructField('helpful', ArrayType(
        IntegerType(), True),
        True),
    StructField('reviewText', StringType(), True),
    StructField('reviewTime', StringType(), True),
    StructField('overall', DoubleType(), True)
])
示例13: PSparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
#.........这里部分代码省略.........
mynames = names
else:
# In the future we could avoid this expensive call.
first_line = self.spark_ctx.textFile(file_path).first()
frame = pandas.read_csv(sio(first_line), **kwargs)
# pylint sees frame as a tuple despite it being a DataFrame
mynames = list(frame.columns)
_skiprows += 1
# Do the actual load
if use_whole_file:
return self.from_pandas_rdd(
self.spark_ctx.wholeTextFiles(file_path)
.mapPartitionsWithIndex(csv_file))
else:
return self.from_pandas_rdd(
self.spark_ctx.textFile(file_path)
.mapPartitionsWithIndex(csv_rows))
def parquetFile(self, *paths):
    """Loads a Parquet file, returning the result as a L{DataFrame}.

    Parameters
    ----------
    paths: string, variable length
        The path(s) of the parquet files to load. Should be Hadoop style
        paths (e.g. hdfs://..., file://... etc.).

    Returns
    -------
    A L{DataFrame} of the contents of the parquet files.
    """
    # Unpack the paths so each one reaches the SQL context as its own
    # argument; forwarding the tuple itself handed the underlying call a
    # single tuple where it expects path strings.
    return self.from_spark_rdd(self.sql_ctx.parquetFile(*paths))
def jsonFile(self, path, schema=None, sampling_ratio=1.0):
    """Load line-delimited JSON (one object per line) as a L{DataFrame}.

    Parameters
    ----------
    path: string
        Location of the json files to load, given as a Hadoop style path
        (e.g. hdfs://..., file://... etc.).
    schema: StructType, optional
        Schema of the input in Spark SQL's schema format, if known. When
        omitted, the json records are sampled to work the schema out. The
        format is described (somewhat) in the "Programmatically Specifying
        the Schema" section of the Spark SQL programming guide:
        http://bit.ly/sparkSQLprogrammingGuide
    sampling_ratio: float, default=1.0
        Fraction of the records to sample when inferring the schema. The
        default samples every record for safety; a lower ratio may be
        enough when the same fields are present across records or the
        input is sufficiently large.

    Returns
    -------
    A L{DataFrame} of the contents of the json files.
    """
    return self.from_spark_rdd(
        self.sql_ctx.jsonFile(path, schema, sampling_ratio))
def from_pd_data_frame(self, local_df):
"""Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
The intend use is for testing or joining distributed data with local
data.
The types are re-infered, so they may not match.
Parameters
示例14: show
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from numpy import array
from pyspark.mllib.clustering import KMeans
import sys
def show(x):
    """Print ``x`` on its own line.

    Used below as a ``foreach`` callback and for progress messages.
    """
    # The call form with one argument works on both Python 2 and 3; the
    # bare print statement is a syntax error on Python 3.
    print(x)
# SQLContext is used below but was missing from this script's imports.
from pyspark.sql import SQLContext

# Exactly one argument is required: the JSON file of tweets.
if len(sys.argv) != 2:
    sys.stderr.write("Usage: handsOn4.py <filename>\n")
    # sys.exit instead of the interactive-only ``exit`` helper from ``site``
    sys.exit(-1)

sc = SparkContext()
sqlContext = SQLContext(sc)

# Load the tweets, print them, and expose them to SQL as table "pt".
tweets = sqlContext.jsonFile(sys.argv[1])
tweets.foreach(show)
tweets.registerTempTable("pt")

# Keep only rows that have hashtags.
words = sqlContext.sql("select hashtags from pt where hashtags is not null")
words.foreach(show)

# Turn each row's hashtags into an array and hash it to a term-frequency vector.
wordsArray = words.map(lambda x: array(x[0]))
hashingTF = HashingTF()
tf = hashingTF.transform(wordsArray)
tf.foreach(show)

show("Executing Kmeans")
# Positional arguments: data, 2, 1, 1.
# NOTE(review): presumably k=2, maxIterations=1, runs=1 - confirm against
# the installed MLlib version.
clusters = KMeans.train(tf, 2, 1, 1)
# Pair every hashtag array with the cluster predicted for it.
results = wordsArray.map(lambda x: array([x, clusters.predict(hashingTF.transform(x))]))
results.foreach(show)
示例15: enumerate
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import jsonFile [as 别名]
# fx_yprime = self.all_data[y][i] #self.get_feats(self.h_tuples[i][0], t)
# dot_vector = numpy.dot(numpy.array(fx_yprime), self.param)
# numerator = math.exp(dot_vector)
# prob = numerator / denominator
# for j,val in enumerate(all_sum):
# all_sum[j] += prob * fx_yprime[j]
if __name__ == '__main__':
    # Script entry point: build the inputs, publish shared values, then
    # minimise cost1 with L-BFGS-B starting from an all-zero parameter vector.
    features = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14]
    input_data, tags = create_input_dataset()
    # NOTE(review): not referenced again in this block; kept because it is a
    # module-level name that other code may read.
    distributed_input_data = sqlContext.jsonFile('data.json')
    gradient_preprocess1(input_data, list(set(tags)))

    # Broadcast values are module-level names.
    # NOTE(review): presumably read by cost1 / gradient1_new - confirm.
    all_tags = sc.broadcast(list(set(tags)))
    no_of_features = sc.broadcast(len(features))
    size = sc.broadcast(len(input_data))

    # One zero-initialised parameter per feature.
    param = [0] * len(features)

    dt1 = datetime.datetime.now()
    # Produces the same text as the former Python 2 statement
    # ``print 'before training: ', dt1`` and also runs on Python 3.
    print('%s %s' % ('before training: ', dt1))
    params = mymin(cost1, param, method='L-BFGS-B', jac=gradient1_new, options={'maxiter': 100})
    print(params.x)
    print(params)