

Python SQLContext.registerFunction Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.SQLContext.registerFunction. If you are wondering what SQLContext.registerFunction does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


The 13 code examples of SQLContext.registerFunction below are ordered by popularity by default.
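As a quick orientation before the examples, here is a minimal sketch of the typical registerFunction workflow on the legacy SQLContext API (Spark 1.x): register a Python lambda under a SQL-visible name, optionally with an explicit return type, and call it from a SQL query. The table name "people" and column "name" are illustrative assumptions, not taken from any of the examples below.

# Minimal sketch (assumed names: "people" table, "name" column; not from the examples below)
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="RegisterFunctionSketch")
    sqlContext = SQLContext(sc)

    df = sqlContext.createDataFrame([Row(name="alice"), Row(name="bob")])
    df.registerTempTable("people")

    # registerFunction(name, f, returnType) exposes a Python callable to SQL;
    # the return type defaults to StringType if omitted.
    sqlContext.registerFunction("name_len", lambda s: len(s), IntegerType())

    sqlContext.sql("SELECT name, name_len(name) AS len FROM people").show()
    sc.stop()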

Example 1: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
def main(sc):
    path = "events"
    #text_file = sc.textFile(path)
    sqlContext = SQLContext(sc)
    events = sqlContext.jsonFile(path)

    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)

    events_df.registerTempTable("events")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

    e = sqlContext.sql("select title, str_date(start_time) as event_date, "
                       "to_hour(start_time) as hour, postal_code from events "
                       "where postal_code is not null and start_time is not null")
    e.registerTempTable("events_filtered")  # register the filtered result so the grouping query below can reference it

    events_grouped = sqlContext.sql("select event_date, hour, postal_code, count(*) "
                                    "from events_filtered group by event_date, hour, postal_code "
                                    "order by postal_code, hour")

    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 32, Source: parse_events.py

Example 2: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    header = taxiFile.first()

    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)

    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))

    taxi_rdd = taxi_temp.map(lambda p: Row(vendor_id=p[0],
    pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
    dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
    passenger_count=int(p[3] if p[3]!="" else 0),
    trip_distance=float(p[4] if p[4]!="" else 0),
    pickup_longitude=float(p[5] if p[5]!="" else 0) ,
    pickup_latitude=float(p[6] if p[6]!="" else 0),
    rate_code=p[7],
    store_and_fwd_flag=p[8],
    dropoff_longitude=float(p[9] if p[9]!="" else 0),
    dropoff_latitude=float(p[10] if p[10]!="" else 0),
    payment_type=p[11],
    fare_amount=float(p[12] if p[12]!="" else 0),
    surcharge=float(p[13] if p[13]!="" else 0),
    mta_tax=float(p[14] if p[14]!="" else 0),
    tip_amount=float(p[15] if p[15]!="" else 0),
    tolls_amount=float(p[16] if p[16]!="" else 0),
    total_amount=float(p[17] if p[17]!="" else 0),
    zipcode=p[18]))

    taxi_df = sqlContext.createDataFrame(taxi_rdd)

    taxi_df.registerTempTable("taxi")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
 
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")

    th.registerTempTable("taxi_hr")
    sqlContext.cacheTable("taxi_hr")

    grouped_taxi = sqlContext.sql("select hour, zipcode,str_date(trip_date), count(*) as c from taxi_hr group by hour,zipcode,str_date(trip_date) order by c desc")
    grouped_taxi.show(100)
 
    #save this intermediate result to a file as csv
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')

    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 53, Source: parse_taxi.py

Example 3: SparkConf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import DoubleType

from com.esri.udt import PointType, PointUDT

if __name__ == "__main__":

    conf = SparkConf().setAppName("GDB App")
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)

        sqlContext.registerFunction("getX", lambda p: p.x, DoubleType())
        sqlContext.registerFunction("getY", lambda p: p.y, DoubleType())
        sqlContext.registerFunction("plus2", lambda p: PointType(p.x + 2, p.y + 2), PointUDT())

        points = "Points"
        df_points = sqlContext.read \
            .format("com.esri.gdb") \
            .options(path="../../test/resources/Test.gdb", name=points, numPartitions="1") \
            .load()
        df_points.printSchema()
        df_points.registerTempTable(points)
        rows = sqlContext.sql("select plus2(Shape),X,Y from {}".format(points))
        for row in rows.collect():
            print row

        lines = "Lines"
        df_lines = sqlContext.read \
            .format("com.esri.gdb") \
Author: giserh, Project: spark-gdb, Lines: 33, Source: udtapp.py

Example 4: str

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
    base_uri = "http://api.wunderground.com/api/{}/history_".format(weath_api_key)
    day = str(day)
    year = str(year)
    ds = day + " " + year
    x = datetime.datetime.strptime(ds, "%B %d %Y").date()
    date_formated = "".join([str(x.year), lead_zero(str(x.month)), lead_zero(str(x.day))])

    location_l = location.split(",")
    state = location_l[1].replace(" ", "")
    city = location_l[0].replace(" ", "_")

    ruri = base_uri + date_formated + "/q/" + state + "/" + city + ".json"
    return ruri

#register the function for SQL to use
sqlContext.registerFunction("weather_uri", weather_uri)


# In[32]:

query = """
select
distinct
year,
Day,
city_stadium_map.stadium_city,
weather_uri(city_stadium_map.stadium_city, year, Day) as weather_uri
from scores
inner join city_stadium_map on
 scores.stadium = city_stadium_map.stadium
limit 10
Author: bradenrc, Project: nfl_weather, Lines: 33, Source: none_issue.py

Example 5: Functions

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
result.show()

# User defined functions
# Spark SQL also provides functionality similar to the User Defined Functions (UDFs) offered in Hive.
# Spark uses the registerFunction() method to register Python functions with the SQLContext.

# user defined function
def transform_review(review):
    x1 = re.sub('[^0-9a-zA-Z\s]+','',review)
    return [x1.lower()]

# register table from above
result.registerAsTable("result")

# register function from above
sqc.registerFunction("to_lowercase", lambda x:transform_review(x),returnType=ArrayType(StringType(), True))

# use the registered function inside SQL 
sql_query_transform = """SELECT asin, reviewText, to_lowercase(reviewText) as cleaned
            FROM result
"""

result_transform = sqc.sql(sql_query_transform)
result_transform.show()

# FINALLY,  Mix and Match!!

# You can also mix DataFrames, RDDs and SparkSQL to make it work for you. 

# Scenario:
# We want to investigate the average rating of reviews in terms of the categories they belong to. In order to do this, we:
Author: FuzzyDuck79, Project: spark_tutorial, Lines: 33, Source: spark_tutorial.py

Example 6: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)


    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc,sqlContext,inputFile,nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,dt: fspLib.inEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")


    #Split data into 2 RDDs depending on being in or out of the region of interest
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'): os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/'+jobNm, encoding="utf-8",mode="wb")
    for row in rows:
        try:
            buffer =  [row.lat,row.lon,row.user,row.dt.date(),row.text,row.dt]
            buffer = map(lambda x: unicode(x).replace(u'\t',u' ').replace(u'\n',u' '),buffer)
            fOut.write(u'\t'.join(buffer)+u'\n')
        except:
            traceback.print_exc()
Author: theseusyang, Project: GEQE, Lines: 33, Source: viewTrainingData.py

Example 7: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip1.csv")
    header = taxiFile.first()

    fields = [StructField(field_name, StringType(), True) for field_name in header.split(',')]
    fields[1].dataType = TimestampType()
    fields[2].dataType = TimestampType()
    fields[3].dataType = IntegerType()
    fields[4].dataType = FloatType()
    fields[5].dataType = FloatType()
    fields[6].dataType = FloatType()
    fields[9].dataType = FloatType()
    fields[10].dataType = FloatType()
    fields[12].dataType = FloatType()
    fields[13].dataType = FloatType()
    fields[14].dataType = FloatType()
    fields[15].dataType = FloatType()
    fields[16].dataType = FloatType()
    fields[17].dataType = FloatType()

    schema = StructType(fields)
    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)

    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))

    taxi_rdd = taxi_temp.map(lambda p: (p[0],
    datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
    datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
    int(p[3] if p[3]!="" else 0),
    float(p[4] if p[4]!="" else 0),
    float(p[5] if p[5]!="" else 0) ,
    float(p[6] if p[6]!="" else 0),
    p[7],
    p[8],
    float(p[9] if p[9]!="" else 0),
    float(p[10] if p[10]!="" else 0),
    p[11],
    float(p[12] if p[12]!="" else 0),
    float(p[13] if p[13]!="" else 0),
    float(p[14] if p[14]!="" else 0),
    float(p[15] if p[15]!="" else 0),
    float(p[16] if p[16]!="" else 0),
    float(p[17] if p[17]!="" else 0),
    p[18] ))


    taxi_df = sqlContext.createDataFrame(taxi_rdd, schema)

    taxi_df.registerTempTable("taxi")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("to_date", lambda x: x.date())
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day))
 
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, to_date(dropoff_datetime) as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")

    th.registerTempTable("taxi_hr")

    #test_hr = sqlContext.sql("select hour, count(*) from taxi_hr group by hour,trip_date")
    test_hr = sqlContext.sql("select hour, zipcode,trip_date, count(*) as c from taxi_hr group by hour,zipcode,trip_date order by c desc")
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 64, Source: new_parse_taxi.py

Example 8: StructField

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
                           StructField("TRIP_ID", StringType(), True),\
                           StructField("STOP_ID", StringType(), True),\
                           StructField("time",StringType(), True)])

    real_stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
                                                                       .load('new_predict.csv', schema = real_stoptimes_schema)              
    stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
                                                                  .load('stop_times.txt',schema = stop_times_schema)
    
    new_time = real_stoptimes.withColumn('realtime',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[1])\
                             .withColumn('date',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[0])
   
    new_time.registerTempTable('new_time')
    stoptimes.registerTempTable('stoptimes')

    sqlContext.registerFunction("getsec", lambda x: get_sec(x), IntegerType()) #register python function into sql

    join = sqlContext.sql('SELECT ROUTE_ID,TRIP_ID,STOP_ID,realtime,date,(getsec(realtime)-getsec(arrival_time)) as delay\
                           FROM new_time\
                           INNER JOIN stoptimes\
                           ON (TRIP_ID = trip_id AND STOP_ID = stop_id)') # join with GTFS data

    join.registerTempTable('new_join')

    with open(sys.argv[-2]) as fr: #read the SQL query from a file
        query = fr.read()

    sqlContext.sql(query)\
              .map(lambda x: ",".join(map(str, x)))\
              .saveAsTextFile(sys.argv[-1])
Author: sarangof, Project: Bus-Capstone, Lines: 32, Source: on_time_ratio.py

Example 9: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
    sc = SparkContext(master, app_name)
    sql_context = SQLContext(sc)

    lines = sc.textFile(input)
    parts = lines.map(lambda l: l.split(separator)).filter(lambda x: len(x) == 32)

    schema_string = "site_id,site_uuid,site_uuid_ctime,ptitle,url," \
                    "referrer,prevPID,attime,resolution,ip," \
                    "ctime,language,cookie_enabled,ua,uuid," \
                    "uuid_ctime,browser,os,tag_key,supp_id," \
                    "gw_id,portal_version,from_page,channel_id,channel_list_id," \
                    "content_id,advid,appid,spenttime,assingleaccess," \
                    "asfirstaccess,aslastaccess"
    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(separator)]
    schema = StructType(fields)

    schema_rdd = sql_context.createDataFrame(parts, schema)
    schema_rdd.registerTempTable("sitepvv3")

    sql_context.registerFunction("to_date", lambda x:Fun().to_date(x), DateType())
    sql_context.registerFunction("datediff",lambda x,k:Fun().datediff(x,k),IntegerType())

    sql_context.registerFunction("hour",lambda x:Fun().to_hour(x),IntegerType())
    sql_context.registerFunction("str_conver_int",lambda x:int(x),IntegerType())

    begin_time = dest_time_str
    end_time = dest_time_str
    Sitepvv3Service().exec_file(sql_context,begin_time,end_time)

    sc.stop()
Author: wangcunxin, Project: spark_py, Lines: 32, Source: main.py

Example 10: sum

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
                           sum(duration) as total_duration\
                from       interactions\
                group by   protocol_type").show()


sqlContext.sql("select     protocol_type,\
                           count(*) as cnt,\
                           avg(duration) as total_duration\
                  from     interactions\
                group by   protocol_type").show()

#udf in DF
label_fun = udf(lambda x: "normal" if x == "normal." else "attack", StringType())

#udf in sqlContext
sqlContext.registerFunction('strLength', lambda x: len(x))

#add attack column
df2 = interactions_labeled_df.withColumn('attack',label_fun(interactions_labeled_df['label']))
df2.groupBy("attack").count().show()

interactions_labeled_df.registerTempTable("interactions_label")

sqlContext.sql("select     label,\
                           case when label = 'normal.'\
                                then 'normal'\
                                else 'attack'\
                                end as attack\
                  from     interactions_label")\
              .groupBy('attack')\
              .count()\
Author: wlsherica, Project: HadoopCon_2015_SparkSQL, Lines: 33, Source: SparkSQL_training.py

Example 11: SparkConf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    print "Loading tweets from " + inputFile
    input = sqlCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = sqlCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    sqlCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = sqlCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
Author: AI-Org, Project: learning-spark, Lines: 29, Source: SparkSQLTwitter.py

Example 12: print

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
		SELECT nombre, apellidos, nota
		FROM Estudiantes
		WHERE nota >= 8
		ORDER BY apellidos ASC
	""")

print("Notables: ")
print(notables.show())

## User-defined functions
def notatxt(nota):
	if nota < 5:
		return "suspenso"
	if nota < 6.5:
		return "aprobado"
	if nota < 9:
		return "notable"
	if nota < 9.9:
		return "excelente"
	return "matrícula"

sqlCtx.registerFunction("notatxt",notatxt)
publicada = sqlCtx.sql("""
		SELECT apellidos,nombre,notatxt(nota) AS Expediente
		FROM Estudiantes
		ORDER BY apellidos
	""")

print("Notas txt: ")
print(publicada.show())
Author: andrs, Project: libro, Lines: 32, Source: consultasSql.py

Example 13: date

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import registerFunction [as alias]
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOU INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/":2}

LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath",help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'",default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 3",default = 8)
    parser.add_argument("--stopWordsFile",help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt",default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma seperated list of stop words to add include on this run",default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
Author: theseusyang, Project: GEQE, Lines: 32, Source: filterData.py


Note: The pyspark.sql.SQLContext.registerFunction examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.