本文整理汇总了Python中pyspark.sql.SQLContext.load方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.load方法的具体用法?Python SQLContext.load怎么用?Python SQLContext.load使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext
的用法示例。
在下文中一共展示了SQLContext.load方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: save_pslist
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def save_pslist(sContext):
    """Run the Volatility 'pslist' plugin over every saved imageinfo row and append the results to Parquet.

    sContext: an active SparkContext.
    Side effects: reads 'Volatility/imageinfo', writes 'Volatility/pslist'.
    """
    sql_ctx = SQLContext(sContext)
    runner = utils.SparkVolatility('pslist')
    # Load the previously saved imageinfo frame and expose it to Spark SQL.
    image_df = sql_ctx.load('Volatility/imageinfo')
    sql_ctx.registerDataFrameAsTable(image_df, "imageinfo")
    rows = sql_ctx.sql("SELECT * FROM imageinfo")
    # One Volatility execution per image row; cache before converting back
    # to a DataFrame so the RDD is not recomputed.
    results = rows.map(runner.Execute)
    results.cache()
    results.toDF().save('Volatility/pslist', 'parquet', 'append')
示例2: selectJson
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def selectJson(sc, columns, filePath):
    """Print the requested columns of a JSON file to stdout.

    A leading '*' in *columns* means show the whole DataFrame;
    otherwise only the named columns are selected and shown.
    """
    sql_ctx = SQLContext(sc)
    if columns[0] != '*':
        # Project only the requested columns.
        sql_ctx.load(filePath, "json").select(columns).show()
    else:
        # Wildcard: dump the entire DataFrame.
        sql_ctx.jsonFile(filePath).show()
示例3: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def main(self, sc, *args):
    """Load a Postgres table over JDBC, log its schema, and write the rows to the task output.

    sc: active SparkContext supplied by the task runner.
    Side effects: opens self.output() for writing.
    """
    logging.info("=======SPARK JOB=========")
    sqlContext = SQLContext(sc)
    df = (sqlContext.load(source="jdbc",
                          url="jdbc:postgresql://localhost:2222/mydatabase?user=dbuser&password=dbpassword",
                          dbtable="tablename"))
    # df.show() prints the frame itself and returns None; the outer print()
    # is redundant but kept to preserve the original console output.
    print(df.show())
    logging.info(df.printSchema())
    with self.output().open('w') as outFile:
        # BUG FIX: the original wrote str(result), but `result` was never
        # defined anywhere and raised NameError. Persist the collected
        # rows of the loaded DataFrame instead.
        outFile.write(str(df.collect()))
示例4: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def main():
    """Export distinct vernacular-string index records from the MySQL TSV dump to text files."""
    cleanup()
    spark = SparkContext()
    ctx = SQLContext(spark)
    tsv_path = os.path.join(mysql_export_dir, "vernacular_string_indices.tsv")
    # The NUL quote character effectively disables quoting in the CSV reader.
    frame = ctx.load(source='com.databricks.spark.csv', header='true', inferSchema='true',
                     path=tsv_path, quote="\u0000", delimiter="\t")
    records = frame.rdd.map(extract_vernacular_strings_fields).distinct()
    records.saveAsTextFile(vernacular_string_indices_output)
示例5: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def main():
    """Parse every name in the name_strings TSV dump and save the distinct parsed records."""
    cleanup()
    spark = SparkContext()
    ctx = SQLContext(spark)
    tsv_path = os.path.join(mysql_export_dir, "name_strings.tsv")
    # The NUL quote character effectively disables quoting in the CSV reader.
    frame = ctx.load(source='com.databricks.spark.csv', header='true', inferSchema='true',
                     path=tsv_path, quote="\u0000", delimiter="\t")
    raw_names = frame.rdd.map(lambda row: row["name"])
    # parse_spark emits JSON strings; decode each before field extraction.
    parsed = (parse_spark(spark, raw_names)
              .map(json.loads)
              .map(extract_name_strings_fields)
              .distinct())
    parsed.saveAsTextFile(name_strings_output)
示例6: travelTime
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def travelTime(startInt, endInt, routeIndex):
# Get route info
# [route_short_name] [start_lat] [start_lon] [end_lat] [end_lon]
curs.execute(BIXISQL.getRouteRow(routeIndex))
route = curs.fetchall()
if route[0][0] != 24: return # REMOVE
# Construct SQL Query for ttc raw data table
initQuery = BIXISQL.getTTCRawRouteSQL(tableName, startInt, endInt, route[0][0])
#print "Initial Query: " + initQuery
# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)
# Get tables from SQL Database
#print "BQ"
ttcRawTable = sqlContext.load(None, "jdbc", None, url=BIXISQL.getSrcUrl(), dbtable=initQuery, driver=BIXISQL.getDriverName())
sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")
#print ttcRawTable.count()
if ttcRawTable.count() < 1:
sc.stop()
return
#print "AQ"
#routeTable = sqlContext.load(None, "jdbc", None, url=urlDest, dbtable=routeQuery, driver=driverName)
# change into accessible array
#route = routeTable.collect()
#idList = sqlContext.sql("SELECT DISTINCT(vehicle_id) FROM rawData").sample(False, sampleRate).collect()
#print "idList: " + str(len(idList)) + " [" + str(route[0][0]) + "]"
#for row in idList:
#print row
# print "vehicle_id: " + str(row.vehicle_id)
# tempTable = sqlContext.sql("SELECT dateTime, dirTag FROM rawData WHERE vehicle_id=" + str(row.vehicle_id))
# print "Count: " + str(tempTable.count())
# print "start: "
# tempTable.sort(asc('dateTime')).show(n=1)
# print "end: "
# tempTable.sort(desc('dateTime')).show(n=1)
curTime = startInt
#print "route: " + str(route[i].route_short_name)
# Get the upper and lower bounds for the start location of the route
#startLatUpper = round(float(str(route[0][1])), Prec) + Tol
#startLatLower = round(float(str(route[0][1])), Prec) - Tol
#startLonUpper = round(float(str(route[0][2])), Prec) + Tol
#startLonLower = round(float(str(route[0][2])), Prec) - Tol
#endLatUpper = round(float(str(route[0][3])), Prec) + Tol
#endLatLower = round(float(str(route[0][3])), Prec) - Tol
#endLonUpper = round(float(str(route[0][4])), Prec) + Tol
#endLonLower = round(float(str(route[0][4])), Prec) - Tol
#print "start: " + str(startLatLower) + " " + str(startLatUpper) + " " + str(startLonLower) + " " + str(startLonUpper)
#print "end: " + str(endLatLower) + " " + str(endLatUpper) + " " + str(endLonLower) + " " + str(endLonUpper)
# Select a sample list of bus ids
idList = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(startInt) + "' AND dateTime<'" + str(startInt + timeInt) + "' ORDER BY nbBikes ASC").limit(maxSampleSize)
sqlContext.registerDataFrameAsTable(idList, "idTable")
curTime = startInt + timeInt
while curTime < endInt:
temp = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "' ORDER BY nbBikes").limit(maxSampleSize)
idList = idList.unionAll(temp)
curTime += timeInt
idList.show()
idList = idList.distinct().collect()
# Loop through bus id list to calculate travel time
print "Route: " + str(route[0][0])
trvSchm = ['startTime', 'trvTime']
trvTimeList = sqlContext.createDataFrame([('00:00:00',0)], trvSchm).limit(0)
#newRow = sqlContext.createDataFrame([('00:00:01',1)], schema)
#trvTimeList = trvTimeList.unionAll(newRow)
for busrow in idList:
print busrow.nbBikes
temp = sqlContext.sql("SELECT dateTime FROM rawData WHERE station_id=" + str(busrow.station_id) + " ORDER BY dateTime ASC").collect()
rangeSize = len(temp)
print str(temp[0].dateTime) + " " + str(temp[0].dirTag)
print "List Size: " + str(rangeSize)
trvStart = temp[0].dateTime
trvCount = 0
trvSum = 0
trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
for i in range(1, rangeSize):
#print temp[i]
if temp[i].dirTag != temp[i-1].dirTag:
trvEnd = temp[i-1].dateTime #DT.datetime.strptime(temp[i-1].dateTime, "%Y-%m-%d %H:%M:%S")
tempTrip = (trvEnd - trvStart).total_seconds() / 60 # caculate travel time in minutes
if tempTrip > minTravel:
trvSum += tempTrip
trvCount += 1
#trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
#newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
#trvTimeList = trvTimeList.unionAll(newRow)
#print "new: " + str(trvStart.hour) + " " + str(trvInt) + " " + str(tempTrip)
trvStart = temp[i].dateTime
if (int(trvStart.hour / timeIntHr) * timeIntHr != trvInt) and (trvCount != 0):
# print "trvInt: " + str(trvInt) + " " + str(trvStart.hour / timeIntHr)
#.........这里部分代码省略.........
示例7: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
import numpy as np
import random
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Module-level driver setup for this clustering script.
#get spark context
sc = SparkContext(appName="APProject")
#get SQL context
sqlContext = SQLContext(sc)
#first argument is data file name
dataFilePath=sys.argv[1]
#dataFilePath="train.csv"
#create data frame from the given CSV file
# (spark-csv datasource; no explicit schema is supplied here)
trainDataFrame = sqlContext.load(source="com.databricks.spark.csv", header="true", path = dataFilePath)
# Registered as a temp table so later code in the script can query it via Spark SQL.
trainDataFrame.registerTempTable("trainDataFrame")
#input: point and array of known centroids
#output: if given point is centroid returns centroid name else returns "Not Centroid"
def isCentroid(point, centroidArray):
    """Return the 1-based label ("C1", "C2", ...) of the centroid matching *point*.

    point: an (x, y) pair.
    centroidArray: iterable of entries whose element [1] is the centroid's (x, y).
    Returns "Not Centroid" when no entry matches.
    """
    for i, centroid in enumerate(centroidArray, start=1):
        # Compare coordinates at 6 decimal places via string formatting to
        # absorb floating-point noise (same tolerance as the original).
        # Uses short-circuit `and` instead of the original bitwise `&`,
        # and enumerate() instead of a hand-rolled counter.
        if ("%.6f" % point[0] == "%.6f" % centroid[1][0]
                and "%.6f" % point[1] == "%.6f" % centroid[1][1]):
            return "C" + str(i)
    return "Not Centroid"
#input: point and array of known centroids
#output: if point is not centroid then calculate centroid and return the name else if point is centroid return its name
def getCentroid(point, centroidArray):
示例8: open
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
#!/usr/bin/python
"""Load the 'iraq' table from Postgres over JDBC with Spark SQL and print it."""
import os
import yaml

# safe_load parses plain YAML only; the original used yaml.load(f) without a
# Loader, which is deprecated and can execute arbitrary Python tags from the
# config file.
with open('../config.yaml', 'r') as f:
    conf = yaml.safe_load(f)

user = conf['psql']['user']
# Spark must be configured via the environment before pyspark is imported.
os.environ["SPARK_HOME"] = conf['psql']['sparkHome']
os.environ["SPARK_CLASSPATH"] = conf['psql']['sparkClass']

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

sc = SparkContext("local", "Simple App")
sqlContext = SQLContext(sc)
df = sqlContext.load(
    url="jdbc:postgresql://localhost/%s" % (user),
    dbtable="iraq",
    password="",
    user=user,
    source="jdbc",
    driver="org.postgresql.Driver"
)
print('\n\n\nJob Started!')
df.show()
print('Job Finished!\n\n\n')
sc.stop()
示例9: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
# Standalone script: pull BIXI availability rows over JDBC and derive lat/lon
# search bounds for one station.
# CLI: argv[1]=start, argv[2]=end ("%Y-%m-%d %H:%M:%S"), argv[3]=station id, argv[4]=output file.
startTime = DT.datetime.strptime(sys.argv[1], "%Y-%m-%d %H:%M:%S")
endTime = DT.datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
station_id = sys.argv[3]
fileName = sys.argv[4]
# JDBC driver name and source/destination URLs come from the process environment.
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]
# Start Spark SQL Context
sc = SparkContext("local", "TTCData")
sqlContext = SQLContext(sc)
# Query lat and lon data for specified time period and routeTag
# (subquery aliased as T so it can be used as a JDBC dbtable).
initQuery = "(SELECT dateTime, nbBikes, lat, lon FROM BIXI WHERE dateTime>='" + str(startTime) + "' AND dateTime<'" + str(endTime) + \
"' AND station_id=" + str(station_id) + ") AS T"
routeTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName)
sqlContext.registerDataFrameAsTable(routeTable, "rawData")
# Query route stop lat and lon
# NOTE(review): `routeTag` is never defined in this script — as written the
# next line raises NameError; presumably it should come from argv. TODO confirm.
initQuery = "(SELECT start_lat, start_lon FROM BIXI_ROUTES WHERE route_short_name=" + str(routeTag) + ") AS T"
routeLoc = sqlContext.load(None, "jdbc", None, url=dbDestUrl, dbtable=initQuery, driver=driverName).collect()
# Calculate lat and lon ranges
# NOTE(review): Prec (rounding precision) and Tol (tolerance) are assumed to be
# constants defined outside this excerpt — confirm.
startLatUpper = round(float(str(routeLoc[0].start_lat)), Prec) + Tol
startLatLower = round(float(str(routeLoc[0].start_lat)), Prec) - Tol
startLonUpper = round(float(str(routeLoc[0].start_lon)), Prec) + Tol
startLonLower = round(float(str(routeLoc[0].start_lon)), Prec) - Tol
# Loop through at time interval
curTime = startTime
freqList = []
示例10: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
# Cloudant external-datasource demo: load a database as a DataFrame and filter it.
conf = SparkConf().setAppName("Cloudant Spark SQL External Datasource in Python")
# define cloudant related configuration (ACCOUNT/USERNAME/PASSWORD are placeholders):
# set protocol to http if needed, default value = https
# conf.set("cloudant.protocol","http")
conf.set("cloudant.host","ACCOUNT.cloudant.com")
conf.set("cloudant.username", "USERNAME")
conf.set("cloudant.password","PASSWORD")
conf.set("jsonstore.rdd.maxInPartition",1000)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.load("n_airportcodemapping", "com.cloudant.spark")
# In case of doing multiple operations on a dataframe (select, filter etc.)
# you should persist the dataframe.
# Otherwise, every operation on the dataframe will load the same data from Cloudant again.
# Persisting will also speed up computation.
df.cache() # persisting in memory
# alternatively for large dbs to persist in memory & disk:
# from pyspark import StorageLevel
# df.persist(storageLevel = StorageLevel(True, True, False, True, 1))
df.printSchema()
df.filter(df.airportName >= 'Moscow').select("_id",'airportName').show()
df.filter(df._id >= 'CAA').select("_id",'airportName').show()
示例11: StructType
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
# Interactive-session transcript exploring DataFrame operations.
# NOTE(review): `sqlCtx`, `students`, and the first `students_df` are defined
# earlier in the session, outside this excerpt.
students_df.groupBy("degree").max("grade").collect()
students_df.groupBy("degree").max("grade").show()
from pyspark.sql.types import *
# Explicit schema for the students data.
schema = StructType([
StructField("id", LongType(), True),
StructField("name", StringType(), True),
StructField("grade", DoubleType(), True),
StructField("degree", StringType(), True) ])
students_df = sqlCtx.createDataFrame(students, schema)
students_json = [ '{"id":100, "name":"Alice", "grade":8.5, "degree":"Computer Science"}', '{"id":101, "name":"Bob", "grade":7.1, "degree":"Engineering"}']
# Write the sample records to disk, then read them back as JSON.
# NOTE(review): the f.write line lost its indentation during extraction; it
# belongs inside the with-block.
with open("students.json", "w") as f:
f.write("\n".join(students_json))
sqlCtx.jsonFile("students.json").show()
# Load the Yelp demo CSV with header handling and schema inference via spark-csv.
yelp_df = sqlCtx.load(source="com.databricks.spark.csv",header = 'true',
inferSchema = 'true',path = '/usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv')
yelp_df.printSchema()
# Three equivalent ways to filter on the "useful" column.
yelp_df.filter(yelp_df.useful >= 1).count()
yelp_df.filter(yelp_df["useful"] >= 1).count()
yelp_df.filter("useful >= 1").count()
yelp_df.select("useful")
yelp_df.select("useful").agg({"useful":"max"}).collect()
yelp_df.select("id", "useful").take(5)
# Express "useful" as a percentage — presumably 28 is the max observed
# "useful" count in this dataset; confirm against the data.
yelp_df.select("id", yelp_df.useful/28*100).show(5)
yelp_df.select("id", (yelp_df.useful/28*100).cast("int")).show(5)
useful_perc_data = yelp_df.select("id", (yelp_df.useful/28*100).cast("int"))
useful_perc_data.columns
useful_perc_data = yelp_df.select(yelp_df["id"].alias("uid"),(yelp_df.useful/28*100).cast("int").alias("useful_perc"))
from pyspark.sql.functions import asc, desc
示例12: run
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
def run(self):
    """Convert a CSV file to Parquet via Spark.

    Reads source/destination paths, a header flag, and an optional
    "name:type,..." schema spec from self.options. Uses the modern
    DataFrameReader API on Spark >= 1.4 and falls back to the legacy
    SQLContext.load / saveAsParquetFile path on Spark <= 1.3.
    """
    csv_path = self.options.csv
    dest_dir = self.options.parquet_dir
    header_present = self.options.has_header
    # I don't know why the Spark guys made this a string instead of a bool
    header_flag = 'true' if header_present else 'false'
    schema_spec = self.options.schema
    # let Spark fail if csv/parquet aren't available
    # can't check paths exist as want to remain generically portable
    # to HDFS, local filesystm or any other uri scheme Spark supports
    log.info("CSV Source: %s" % csv_path)
    log.info("Parquet Destination: %s" % dest_dir)
    if schema_spec:
        def type_instance(arg):
            # Map a (case-insensitive) type token to an instantiated
            # pyspark.sql.types class; dies via usage() on unknown tokens.
            arg = str(arg).lower()
            if arg not in self.types_mapping:
                self.usage(
                    "invalid type '%s' defined in --schema, must be one of: %s"
                    % (arg, ', '.join(sorted(self.types_mapping.keys()))))
            # return self.types_mapping[arg]
            module = __import__('pyspark.sql.types', globals(), locals(),
                                ['types'], -1)
            return getattr(module, self.types_mapping[arg])()

        def field_from_spec(arg):
            # "name[:type]" -> nullable StructField; type defaults to string.
            if ':' in arg:
                (field_name, declared) = arg.split(':', 1)
            else:
                field_name, declared = arg, 'string'
            return StructField(field_name, type_instance(declared), True)

        # see https://github.com/databricks/spark-csv#python-api
        self.schema = StructType(
            [field_from_spec(spec) for spec in schema_spec.split(',')])
        log.info('generated CSV => Spark schema')
    spark_conf = SparkConf().setAppName('HS PySpark CSV => Parquet')
    spark_ctx = SparkContext(conf=spark_conf)  # pylint: disable=invalid-name
    sql_ctx = SQLContext(spark_ctx)  # pylint: disable=invalid-name
    version = spark_ctx.version
    log.info('Spark version detected as %s' % version)
    if not isVersionLax(version):
        die("Spark version couldn't be determined. " +
            support_msg('pytools'))
    # pylint: disable=invalid-name
    df = None
    if isMinVersion(version, 1.4):
        if header_present and not schema_spec:
            log.info('inferring schema from CSV headers')
            df = (sql_ctx.read.format('com.databricks.spark.csv')
                  .options(header=header_flag, inferschema='true')
                  .load(csv_path))
        else:
            log.info('using explicitly defined schema')
            df = (sql_ctx.read
                  .format('com.databricks.spark.csv')
                  .options(header=header_flag)
                  .load(csv_path, schema=self.schema))
        df.write.parquet(dest_dir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        if header_present and not schema_spec:
            log.info('inferring schema from CSV headers')
            df = sql_ctx.load(
                source="com.databricks.spark.csv",
                path=csv_path,
                header=header_flag,
                inferSchema='true')
        elif self.schema:
            log.info('using explicitly defined schema')
            df = sql_ctx.load(
                source="com.databricks.spark.csv",
                path=csv_path,
                header=header_flag,
                schema=self.schema)
        else:
            die('no header and no schema, caught late')
        df.saveAsParquetFile(dest_dir)
示例13: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
# Initialize Variables
# Standalone script: sample BIXI station coordinates over JDBC and bucket them
# into rounded lat/lon groups.
# CLI: argv[1]=start, argv[2]=end ("%Y-%m-%d %H:%M:%S"), argv[3]=output file.
startTime = DT.datetime.strptime(sys.argv[1], "%Y-%m-%d %H:%M:%S")
endTime = DT.datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
fileName = sys.argv[3]
# JDBC driver name and source/destination URLs come from the process environment.
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]
# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)
# Query lat and lon data
# NOTE(review): sampleRate, Prec, Tol and maxSampleSize are assumed to be
# constants defined outside this excerpt — confirm.
initQuery = "(SELECT lat, lon FROM BIXI_STATIONS WHERE dateTime>='" + str(startTime) + "' AND dateTime<'" + str(endTime) + "') AS T"
pointTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName).sample(False, sampleRate)
# Round lat and lon data to precision <Prec>
roundCol = udf(lambda a: str(round(a, Prec)), StringType())
results = pointTable.withColumn('latR', roundCol(pointTable.lat)).withColumn('lonR', roundCol(pointTable.lon)).select('latR', 'lonR')
# Group lat and lon data and count within each group, pick the top <maxSampleSize>*2 groups with the highest bus count
temp = results.groupBy(results.latR, results.lonR).count().sort(desc('count')).limit(maxSampleSize * 2).sort(asc('latR'), asc('lonR')).collect()
# Group points that are close to each other
# based on comparing euclidean distance between lat and lon pairs to the tolerance <Tol>
counter = 0
pointList = []
point = {}
# NOTE(review): the body of this loop continues beyond this excerpt.
for i in range (0, len(temp)):
示例14: Input
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
# Verify Input (max 12-hr interval)
#if (endTime - startTime) > maxInt:
# endTime = startTime + maxInt
# Get route info
# Construct SQL Query for ttc raw data table
# Pull every BIXI row for one station within [startTime, endTime).
initQuery = "(SELECT * FROM BIXI WHERE dateTime >= '" + str(startTime) + "' " + \
"AND dateTime < '" + str(endTime) + "' AND station_id = " + str(station_id) + ") AS T"
#print "Initial Query: " + initQuery
# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)
# Get tables from SQL Database
bixiRawTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName)
sqlContext.registerDataFrameAsTable(bixiRawTable, "rawData")
# NOTE(review): BUG — `ttcRawTable` is undefined here and raises NameError;
# the identical guard in the TTC variant (example 6) suggests this should
# read `bixiRawTable.count()`.
if ttcRawTable.count() < 1:
sc.stop()
print "[BIXITravel.py] ERROR: Query returned empty table, no matching records"
sys.exit()
# Select a sample list of bus ids (one id per interval)
idList = sqlContext.sql("SELECT DISTINCT(nbBikes) FROM rawData WHERE dateTime>='" + str(startTime) + "' AND dateTime<'" + str(startTime + timeInt) + "' LIMIT " + str(maxSampleSize))
curTime = startTime + timeInt
# NOTE(review): no `curTime += timeInt` is visible in this excerpt — the loop
# body appears truncated here; as shown the while-loop would never advance.
while curTime < endTime:
temp = sqlContext.sql("SELECT DISTINCT(nbBikes) FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "' LIMIT " + str(maxSampleSize))
idList = idList.unionAll(temp)
#print "ID COUNT: " + str(idList.count())
示例15: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import load [as 别名]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#******************************************************************************/
import pprint
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
# Cloudant connection settings (ACCOUNT/USERNAME/PASSWORD are placeholders).
conf = SparkConf().setAppName("Cloudant Spark SQL External Datasource in Python")
for key, value in (("cloudant.host", "ACCOUNT.cloudant.com"),
                   ("cloudant.username", "USERNAME"),
                   ("cloudant.password", "PASSWORD")):
    conf.set(key, value)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Load the Cloudant database as a DataFrame via the external datasource.
df = sqlContext.load("airportcodemapping", "com.cloudant.spark")
df.printSchema()
# Show, then persist back to Cloudant, every airport with code >= 'CAA'.
subset = df.filter(df.airportCode >= 'CAA').select("airportCode", 'airportName')
subset.show()
subset.save("airportcodemapping_df", "com.cloudant.spark")