Python SQLContext.inferSchema方法代碼示例

本文整理匯總了Python中pyspark.SQLContext.inferSchema方法的典型用法代碼示例。如果您正苦於以下問題：Python SQLContext.inferSchema方法的具體用法？Python SQLContext.inferSchema怎麼用？Python SQLContext.inferSchema使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類pyspark.SQLContext的用法示例。

在下文中一共展示了SQLContext.inferSchema方法的1個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: main

# 需要導入模塊: from pyspark import SQLContext [as 別名]
# 或者: from pyspark.SQLContext import inferSchema [as 別名]
def main():
    """Load the housing-price input datasets and register them as temp tables.

    Visible steps (the tail of this function is omitted from the listing):

    * create a SparkContext / SQLContext for the app ``housingprice``;
    * read the tax report CSV and expose ``CurVal``, ``Taxassess`` and
      ``PostalCode`` as the temp table ``propertyVal``;
    * read the currency conversion CSV, reduce each date to ``year/month``,
      drop 2016 rows and average the rate per month (``ConverDF``);
    * read the crude-oil text file and register ``oilprice`` keyed by
      ``year/month``;
    * read the interest-rate CSV and register ``allrates`` with the
      ``5y-fixed-posted`` column for 2006-01 through 2015-12.

    NOTE(review): ``inputs`` is not defined inside this function; it is
    presumably a module-level path prefix -- verify against the full file.
    """
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # Every tax-report column is read as a string; only the first three
    # identifier columns are declared non-nullable.
    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])
    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])
    # Not referenced in the visible part of the function (the crude-oil
    # schema is inferred below); kept in case the omitted tail uses it.
    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])

    def fixdate(convVal):
        """Turn ``"M/D/YY rate"`` into a ``("20YY/M", rate)`` tuple."""
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20" + dates[2] + '/' + dates[0]
        return (alldate, a[1])

    def filterYear(dates):
        """Return False for ``"YYYY/M"`` strings whose year is 2016."""
        # fixdate() emits "year/month", so the year is the first component.
        # The previous check looked at index 1 (the month) and therefore
        # never filtered anything out.
        return dates.split('/')[0] != '2016'

    def processDate(df):
        """Split the ``"year/month"`` column ``year`` into ``year`` and ``month``."""
        def splitMonth(cols):
            a = cols.split('/')
            return a[1]

        def splitYear(cols):
            a = cols.split('/')
            return a[0]

        # Derive "month" first: the second withColumn overwrites "year".
        fUDF = udf(splitMonth, StringType())
        df1 = df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))

    # Reading the Tax Report Dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(taxreportSchema).load(inputs+"taxreport/test")
    taxreportinfo.registerTempTable("taxreport")
    # Selecting the price, TaxAssessment Year and Postalcode of each property
    propertyVal = sqlContext.sql("SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")

    # Reading the CAN to USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(conversionSchema).load(inputs+"conversion")
    conversion.registerTempTable("Conversion")
    # Selecting only the date and rate (rows whose rate starts with a digit)
    conversionrate = sqlContext.sql("SELECT date,rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(lambda w: (w.date+" "+w.rate))
    # Tuple-unpacking lambdas (``lambda (w, x): ...``) are a SyntaxError on
    # Python 3, so index into the (date, rate) pair instead.
    conversiondates = (
        conversionRDD
        .map(fixdate)
        .filter(lambda pair: filterYear(pair[0]))
        .map(lambda l: Row(date=l[0], rate=l[1]))
    )
    # NOTE: SQLContext.inferSchema is deprecated since Spark 1.3 in favour of
    # SQLContext.createDataFrame; it is kept here as the original API call.
    schemaConv = sqlContext.inferSchema(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql(" SELECT date,CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()

    # Reading the Canada Crude oil price dataset (whitespace-separated text)
    crudeoil = sc.textFile(inputs+"crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.inferSchema(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")
    # Selecting the date in year/month format and oilprice.
    # Lower-case 'y' is the calendar year; upper-case 'Y' is the week-based
    # year, which can differ from the calendar year around January 1st.
    oilprice = sqlContext.sql("SELECT DATE_FORMAT(date,'y/M') as date,oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')

    # Reading the interestrate of BC Dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs+"interestrate")
    interestRate.registerTempTable("interest")
    # Selecting the date and 5-year fixed mortgage price from the dataset
    interestDF = sqlContext.sql("SELECT DATE_FORMAT(date,'y/M') as date,CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate FROM interest WHERE date >='2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")
#.........這裏部分代碼省略.........

開發者ID:sasoltan，項目名稱:VancouverHousingPrice，代碼行數:103，代碼來源:getproperty.py

注：本文中的pyspark.SQLContext.inferSchema方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。