This article collects typical usage examples of the Python method pyspark.sql.SQLContext.registerFunction. If you are wondering what SQLContext.registerFunction does, or how to use it, the curated code samples below may help. You can also explore further usage examples of its containing class, pyspark.sql.SQLContext.
The following presents 13 code examples of SQLContext.registerFunction, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python samples.
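All of the examples follow the same pattern: build a DataFrame, register it as a temporary table, register a Python callable as a SQL UDF via registerFunction, and query. Before the examples, here is a minimal self-contained sketch of that pattern (Spark 1.x API; the table and column names are illustrative):

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import IntegerType

sc = SparkContext(appName="registerFunction-demo")
sqlContext = SQLContext(sc)

# Build a tiny DataFrame and expose it to SQL.
df = sqlContext.createDataFrame([Row(name="alice"), Row(name="bob")])
df.registerTempTable("people")

# Register a Python lambda as a SQL UDF; without an explicit
# returnType argument, Spark assumes StringType.
sqlContext.registerFunction("name_len", lambda s: len(s), IntegerType())

sqlContext.sql("SELECT name, name_len(name) AS n FROM people").show()
sc.stop()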
Example 1: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
from datetime import datetime
from pyspark.sql import SQLContext, Row

def main(sc):
    path = "events"
    #text_file = sc.textFile(path)
    sqlContext = SQLContext(sc)
    events = sqlContext.jsonFile(path)
    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)
    events_df.registerTempTable("events")
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
    e = sqlContext.sql("select title, str_date(start_time) as event_date, "
                       "to_hour(start_time) as hour, postal_code from events "
                       "where postal_code is not null and start_time is not null")
    # Register the filtered result so the aggregation below can query it.
    e.registerTempTable("events_filtered")
    events_grouped = sqlContext.sql("select event_date, hour, postal_code, count(*) "
                                    "from events_filtered group by event_date, hour, postal_code "
                                    "order by postal_code, hour")
    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
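Examples 1 and 2 both map a toCSV helper over the result without showing its definition. A plausible sketch (hypothetical; the original is not included in the snippet) simply joins the fields of each Row with commas:

def toCSV(row):
    # Hypothetical helper: flatten a Row into one CSV line.
    return ",".join(str(field) for field in row)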
Example 2: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
from datetime import datetime
from pyspark.sql import SQLContext, Row

def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    header = taxiFile.first()
    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)
    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))
    taxi_rdd = taxi_temp.map(lambda p: Row(
        vendor_id=p[0],
        pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
        dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
        passenger_count=int(p[3] if p[3] != "" else 0),
        trip_distance=float(p[4] if p[4] != "" else 0),
        pickup_longitude=float(p[5] if p[5] != "" else 0),
        pickup_latitude=float(p[6] if p[6] != "" else 0),
        rate_code=p[7],
        store_and_fwd_flag=p[8],
        dropoff_longitude=float(p[9] if p[9] != "" else 0),
        dropoff_latitude=float(p[10] if p[10] != "" else 0),
        payment_type=p[11],
        fare_amount=float(p[12] if p[12] != "" else 0),
        surcharge=float(p[13] if p[13] != "" else 0),
        mta_tax=float(p[14] if p[14] != "" else 0),
        tip_amount=float(p[15] if p[15] != "" else 0),
        tolls_amount=float(p[16] if p[16] != "" else 0),
        total_amount=float(p[17] if p[17] != "" else 0),
        zipcode=p[18]))
    taxi_df = sqlContext.createDataFrame(taxi_rdd)
    taxi_df.registerTempTable("taxi")
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, "
                        "dropoff_longitude as lng, dropoff_latitude as lat, zipcode FROM taxi "
                        "where dropoff_longitude != 0 and dropoff_latitude != 0")
    th.registerTempTable("taxi_hr")
    sqlContext.cacheTable("taxi_hr")
    grouped_taxi = sqlContext.sql("select hour, zipcode, str_date(trip_date), count(*) as c "
                                  "from taxi_hr group by hour, zipcode, str_date(trip_date) "
                                  "order by c desc")
    grouped_taxi.show(100)
    # Save this intermediate result to a file as CSV.
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')
    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
Example 3: SparkConf
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import DoubleType
from com.esri.udt import PointType, PointUDT
if __name__ == "__main__":
    conf = SparkConf().setAppName("GDB App")
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)
        sqlContext.registerFunction("getX", lambda p: p.x, DoubleType())
        sqlContext.registerFunction("getY", lambda p: p.y, DoubleType())
        sqlContext.registerFunction("plus2", lambda p: PointType(p.x + 2, p.y + 2), PointUDT())
        points = "Points"
        df_points = sqlContext.read \
            .format("com.esri.gdb") \
            .options(path="../../test/resources/Test.gdb", name=points, numPartitions="1") \
            .load()
        df_points.printSchema()
        df_points.registerTempTable(points)
        rows = sqlContext.sql("select plus2(Shape), X, Y from {}".format(points))
        for row in rows.collect():
            print row
        lines = "Lines"
        df_lines = sqlContext.read \
            .format("com.esri.gdb") \
Example 4: str
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
def weather_uri(location, year, day):
    # Note: the snippet omits the original def line; the signature here is
    # inferred from the parameters used in the body and in the SQL call below.
    base_uri = "http://api.wunderground.com/api/{}/history_".format(weath_api_key)
    day = str(day)
    year = str(year)
    ds = day + " " + year
    x = datetime.datetime.strptime(ds, "%B %d %Y").date()
    date_formated = "".join([str(x.year), lead_zero(str(x.month)), lead_zero(str(x.day))])
    location_l = location.split(",")
    state = location_l[1].replace(" ", "")
    city = location_l[0].replace(" ", "_")
    ruri = base_uri + date_formated + "/q/" + state + "/" + city + ".json"
    return ruri

# Register the function for SQL to use.
sqlContext.registerFunction("weather_uri", weather_uri)

query = """
select
    distinct
    year,
    Day,
    city_stadium_map.stadium_city,
    weather_uri(city_stadium_map.stadium_city, year, Day) as weather_uri
from scores
inner join city_stadium_map on
    scores.stadium = city_stadium_map.stadium
limit 10
"""
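The snippet relies on a lead_zero helper that is not shown. From its use on month and day strings it evidently left-pads to two digits for Wunderground's YYYYMMDD date format; a minimal sketch of what it likely looks like (hypothetical):

def lead_zero(s):
    # Hypothetical helper: pad "3" to "03".
    return s.zfill(2)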
Example 5: Functions
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
result.show()

# User-defined functions
# Spark SQL also provides functionality similar to the User Defined Function (UDF)
# facility in Hive. Python functions are registered with the SQLContext via
# registerFunction().

# User-defined function
def transform_review(review):
    x1 = re.sub('[^0-9a-zA-Z\s]+', '', review)
    return [x1.lower()]

# Register the table from above
result.registerAsTable("result")
# Register the function from above
sqc.registerFunction("to_lowercase", lambda x: transform_review(x), returnType=ArrayType(StringType(), True))

# Use the registered function inside SQL
sql_query_transform = """SELECT asin, reviewText, to_lowercase(reviewText) as cleaned
                         FROM result
                      """
result_transform = sqc.sql(sql_query_transform)
result_transform.show()

# FINALLY, mix and match!
# You can also mix DataFrames, RDDs and Spark SQL to make it work for you.
# Scenario:
# We want to investigate the average rating of reviews in terms of the categories
# they belong to. In order to do this, we:
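For reference, transform_review strips every character that is not a letter, digit, or whitespace and lower-cases the rest, which is why the UDF is registered with ArrayType(StringType()): it returns a single-element list. A quick local check (plain Python, no Spark required):

import re

def transform_review(review):
    x1 = re.sub('[^0-9a-zA-Z\s]+', '', review)
    return [x1.lower()]

print(transform_review("Great product!!! 5/5"))  # ['great product 55']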
Example 6: SparkContext
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Create the polygon list and a broadcast variable based on it
lPolygon = shapeReader.readInShapeJson(shapeFile)
bc_lTargetPolygons = sc.broadcast(lPolygon)

# Read in data; coalesce to limit the number of jobs and avoid shuffling issues later in the job
records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc, sqlContext, inputFile, nDataType)
if inputPartitions != -1:
    records = records.repartition(inputPartitions)
records.cache()
records.registerTempTable('records')
sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
sqlContext.registerFunction("inEventOfInterest", lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons), returnType=BooleanType())
data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")

# Split data into 2 RDDs depending on being in or out of the region of interest
rows = data.collect()
if not os.path.isdir('previewTrainingFiles'):
    os.mkdir('previewTrainingFiles')
fOut = codecs.open('previewTrainingFiles/' + jobNm, encoding="utf-8", mode="wb")
for row in rows:
    try:
        buffer = [row.lat, row.lon, row.user, row.dt.date(), row.text, row.dt]
        buffer = map(lambda x: unicode(x).replace(u'\t', u' ').replace(u'\n', u' '), buffer)
        fOut.write(u'\t'.join(buffer) + u'\n')
    except:
        traceback.print_exc()
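The filtering hinges on fspLib.inROI and fspLib.inEOI, which are project-specific and not shown. The kind of point-in-polygon test inROI presumably performs can be sketched with a standard ray-casting check (a hypothetical stand-in, not the project's actual implementation):

def in_region(lat, lon, polygons):
    # Ray-casting point-in-polygon test; each polygon is a list of
    # (lon, lat) vertices. Hypothetical stand-in for fspLib.inROI.
    for poly in polygons:
        inside = False
        j = len(poly) - 1
        for i in range(len(poly)):
            xi, yi = poly[i]
            xj, yj = poly[j]
            if ((yi > lat) != (yj > lat)) and \
               (lon < (xj - xi) * (lat - yi) / (yj - yi) + xi):
                inside = not inside
            j = i
        if inside:
            return True
    return False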
Example 7: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark.sql.types import (StructType, StructField, StringType,
                               TimestampType, IntegerType, FloatType)

def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip1.csv")
    header = taxiFile.first()
    fields = [StructField(field_name, StringType(), True) for field_name in header.split(',')]
    fields[1].dataType = TimestampType()
    fields[2].dataType = TimestampType()
    fields[3].dataType = IntegerType()
    fields[4].dataType = FloatType()
    fields[5].dataType = FloatType()
    fields[6].dataType = FloatType()
    fields[9].dataType = FloatType()
    fields[10].dataType = FloatType()
    fields[12].dataType = FloatType()
    fields[13].dataType = FloatType()
    fields[14].dataType = FloatType()
    fields[15].dataType = FloatType()
    fields[16].dataType = FloatType()
    fields[17].dataType = FloatType()
    schema = StructType(fields)
    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)
    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))
    taxi_rdd = taxi_temp.map(lambda p: (p[0],
                                        datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
                                        datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
                                        int(p[3] if p[3] != "" else 0),
                                        float(p[4] if p[4] != "" else 0),
                                        float(p[5] if p[5] != "" else 0),
                                        float(p[6] if p[6] != "" else 0),
                                        p[7],
                                        p[8],
                                        float(p[9] if p[9] != "" else 0),
                                        float(p[10] if p[10] != "" else 0),
                                        p[11],
                                        float(p[12] if p[12] != "" else 0),
                                        float(p[13] if p[13] != "" else 0),
                                        float(p[14] if p[14] != "" else 0),
                                        float(p[15] if p[15] != "" else 0),
                                        float(p[16] if p[16] != "" else 0),
                                        float(p[17] if p[17] != "" else 0),
                                        p[18]))
    taxi_df = sqlContext.createDataFrame(taxi_rdd, schema)
    taxi_df.registerTempTable("taxi")
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("to_date", lambda x: x.date())
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day))
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, to_date(dropoff_datetime) as trip_date, "
                        "dropoff_longitude as lng, dropoff_latitude as lat, zipcode FROM taxi "
                        "where dropoff_longitude != 0 and dropoff_latitude != 0")
    th.registerTempTable("taxi_hr")
    #test_hr = sqlContext.sql("select hour, count(*) from taxi_hr group by hour,trip_date")
    test_hr = sqlContext.sql("select hour, zipcode, trip_date, count(*) as c "
                             "from taxi_hr group by hour, zipcode, trip_date order by c desc")
Example 8: StructField
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
StructField("TRIP_ID", StringType(), True),\
StructField("STOP_ID", StringType(), True),\
StructField("time",StringType(), True)])
real_stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
.load('new_predict.csv', schema = real_stoptimes_schema)
stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
.load('stop_times.txt',schema = stop_times_schema)
new_time = real_stoptimes.withColumn('realtime',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[1])\
.withColumn('date',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[0])
new_time.registerTempTable('new_time')
stoptimes.registerTempTable('stoptimes')
sqlContext.registerFunction("getsec", lambda x: get_sec(x), IntegerType()) #register python function into sql
join = sqlContext.sql('SELECT ROUTE_ID,TRIP_ID,STOP_ID,realtime,date,(getsec(realtime)-getsec(arrival_time)) as delay\
FROM new_time\
INNNER JOIN stoptimes\
ON (TRIP_ID = trip_id AND STOP_ID = stop_id)') # join with GTFS data
join.registerTempTable('new_join')
with open(sys.argv[-2]) as fr: #read sql
query = fr.read()
sqlContext.sql(query)\
.map(lambda x: ",".join(map(str, x)))\
.saveAsTextFile(sys.argv[-1])
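get_sec is registered as a UDF but never defined in the snippet. Since the query subtracts two of its results to compute a delay, it almost certainly converts an HH:MM:SS string to seconds since midnight — a plausible sketch (hypothetical):

def get_sec(t):
    # Hypothetical helper: "08:15:30" -> 29730 seconds since midnight.
    h, m, s = t.split(":")
    return int(h) * 3600 + int(m) * 60 + int(s)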
Example 9: SparkContext
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
sc = SparkContext(master, app_name)
sql_context = SQLContext(sc)
lines = sc.textFile(input)
parts = lines.map(lambda l: l.split(separator)).filter(lambda x: len(x) == 32)
schema_string = "site_id,site_uuid,site_uuid_ctime,ptitle,url," \
                "referrer,prevPID,attime,resolution,ip," \
                "ctime,language,cookie_enabled,ua,uuid," \
                "uuid_ctime,browser,os,tag_key,supp_id," \
                "gw_id,portal_version,from_page,channel_id,channel_list_id," \
                "content_id,advid,appid,spenttime,assingleaccess," \
                "asfirstaccess,aslastaccess"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(separator)]
schema = StructType(fields)
schema_rdd = sql_context.createDataFrame(parts, schema)
schema_rdd.registerTempTable("sitepvv3")
sql_context.registerFunction("to_date", lambda x: Fun().to_date(x), DateType())
sql_context.registerFunction("datediff", lambda x, k: Fun().datediff(x, k), IntegerType())
sql_context.registerFunction("hour", lambda x: Fun().to_hour(x), IntegerType())
sql_context.registerFunction("str_conver_int", lambda x: int(x), IntegerType())
begin_time = dest_time_str
end_time = dest_time_str
Sitepvv3Service().exec_file(sql_context, begin_time, end_time)
sc.stop()
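The Fun helper backing to_date, datediff, and hour is project code that is not shown. A minimal sketch consistent with how it is called (hypothetical; assumes timestamps arrive as 'YYYY-MM-DD HH:MM:SS' strings):

from datetime import datetime

class Fun(object):
    # Hypothetical stand-in for the project's helper class.
    FMT = "%Y-%m-%d %H:%M:%S"
    def to_date(self, ts):
        return datetime.strptime(ts, self.FMT).date()
    def to_hour(self, ts):
        return datetime.strptime(ts, self.FMT).hour
    def datediff(self, x, k):
        return (self.to_date(x) - self.to_date(k)).days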
Example 10: sum
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
               sum(duration) as total_duration \
               from interactions \
               group by protocol_type").show()
sqlContext.sql("select protocol_type, \
               count(*) as cnt, \
               avg(duration) as avg_duration \
               from interactions \
               group by protocol_type").show()

# UDF used with the DataFrame API
label_fun = udf(lambda x: "normal" if x == "normal." else "attack", StringType())
# UDF registered on the sqlContext for use in SQL
sqlContext.registerFunction('strLength', lambda x: len(x))

# Add an attack column
df2 = interactions_labeled_df.withColumn('attack', label_fun(interactions_labeled_df['label']))
df2.groupBy("attack").count().show()
interactions_labeled_df.registerTempTable("interactions_label")
sqlContext.sql("select label, \
               case when label = 'normal.' \
               then 'normal' \
               else 'attack' \
               end as attack \
               from interactions_label") \
    .groupBy('attack') \
    .count() \
Example 11: SparkConf
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
# A simple demo for working with Spark SQL and tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    print "Loading tweets from " + inputFile
    input = sqlCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = sqlCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    sqlCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = sqlCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
Example 12: print
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
    SELECT nombre, apellidos, nota
    FROM Estudiantes
    WHERE nota >= 8
    ORDER BY apellidos ASC
""")
print("Notables: ")
notables.show()

## User-defined functions
def notatxt(nota):
    if nota < 5:
        return "suspenso"
    if nota < 6.5:
        return "aprobado"
    if nota < 9:
        return "notable"
    if nota < 9.9:
        return "excelente"
    return "matrícula"

sqlCtx.registerFunction("notatxt", notatxt)
publicada = sqlCtx.sql("""
    SELECT apellidos, nombre, notatxt(nota) AS Expediente
    FROM Estudiantes
    ORDER BY apellidos
""")
print("Notas txt: ")
publicada.show()
Example 13: date
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD-CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/": 2}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe data filter.'", default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 8", default=8)
    parser.add_argument("--stopWordsFile", help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt", default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma-separated list of stop words to include on this run", default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile