

Python SQLContext.inferSchema Method Code Examples

This article collects and summarizes typical usage examples of the Python method pyspark.sql.SQLContext.inferSchema. If you are wondering what exactly SQLContext.inferSchema does, or how to use it, the curated code examples below should help. You can also explore further usage examples of the containing class, pyspark.sql.SQLContext.


The following presents 15 code examples of the SQLContext.inferSchema method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
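Before the collected examples, here is a minimal sketch of the pattern they all share: build an RDD of Row objects, let SQLContext.inferSchema derive the schema, register the result as a temporary table, and query it with SQL. The input path and column names below are placeholders, and the snippet assumes a Spark 1.x / Python 2 environment like the examples on this page; inferSchema was deprecated in Spark 1.3 in favor of SQLContext.createDataFrame.

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="inferSchemaSketch")
sqlContext = SQLContext(sc)

# Placeholder input: tab-separated lines of "userId<TAB>songId<TAB>playCount"
lines = sc.textFile("hdfs:///path/to/input.tsv")
rows = lines.map(lambda l: l.split("\t")) \
            .map(lambda p: Row(userId=p[0], songId=p[1], playCount=int(p[2])))

# Infer the schema from the Row fields and register the result as a temp table
schemaRows = sqlContext.inferSchema(rows)
schemaRows.registerTempTable("plays")

# SQL can now be run over the registered table
top = sqlContext.sql("SELECT userId, songId FROM plays WHERE playCount > 5")
for r in top.collect():
    print r.userId, r.songId

sc.stop()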

Example 1: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    tasteProfileRdd = sc.textFile("userTaste/*")
    songRdd = sc.textFile("songsDict/*")

    # Load the text files and convert each line to a Row.
    tasteProfile = tasteProfileRdd.filter(lambda l: len(l) > 0)
    parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
    userTaste = parsedSplits.map(lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))

    individualSong = songRdd.map(lambda l: l.split('|'))
    songData = individualSong.map(lambda s: Row(songId=s[0], featureSet=s[1]))

    # Infer the schemas, and register the DataFrames as tables.
    schemaUserTaste = sqlContext.inferSchema(userTaste)
    schemaUserTaste.registerTempTable("userTaste")

    schemaSongData = sqlContext.inferSchema(songData)
    schemaSongData.registerTempTable("songData")

    test2 = sqlContext.sql("select * from songData limit 5")
    songIds = test2.map(lambda p: "songIds: " + p.songId)
    #test1 = sqlContext.sql("SELECT distinct * FROM userTaste limit 5")
    #songIds = test1.map(lambda p: "songIds: " + p.songId)

    for i in songIds.collect():
        print i
Developer: himaja20, Project: MusicRecommenderSystem, Lines of code: 29, Source: userPlayCountsSpark.py

Example 2: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def main():
    sqlContext = SQLContext(sc)
    data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(","))
    rows = data.filter(lambda x: x[0] != 'SYMBOL')
    df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip())))

    symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaSymbols = sqlContext.inferSchema(symbols)
    schemaSymbols.registerTempTable("symbols")

    trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
            symbols group by symbol, time""")
    trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaTrades = sqlContext.inferSchema(trades)
    schemaTrades.registerTempTable("trades")

    # remove limit after test
    syms = sqlContext.sql("SELECT distinct symbol from trades")
    syms = syms.collect()

    df_dict = {}
    print type(syms)
    for sym in syms:
        sym = sym.symbol.strip()
        print sym
        sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))

        sym_data = sym_data.collect()
        print len(sym_data)
        sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])

        # Predictive model did not like original volume values, so use rescaled value
        sym_df['volume10k'] = np.round(sym_df['volume'] / 10000, 3)

        for i in range(1,11):
            sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)

        for i in range(1,11):
            #sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)
            sym_df['volume10k_t-'+str(i)] = sym_df['volume10k'].shift(i)

        # add labels for price and volume
        sym_df['price_label'] = sym_df['price'].shift(-1)
        sym_df['volume_label'] = sym_df['volume'].shift(-1)

        sym_df['price_dir_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
        sym_df['volume_dir_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)

        sym_df = sym_df.dropna()
        df_dict[sym] = sym_df
        print sym_df

        train(sym,sym_df)

    # print for testing
    print len(df_dict)
    print df_dict.keys()
    print type(df_dict[sym])
Developer: redame, Project: quote_streaming, Lines of code: 60, Source: build_and_train.py

Example 3: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def main():
    conf = SparkConf().setAppName('symbols').set("spark.storage.blockManagerSlaveTimeoutMs", 60000)
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(",")).cache()
    rows = data.filter(lambda x: x[0] != 'SYMBOL')
    df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip()))) 
    #df = df.filter(lambda x: x[1] != 0)    

    symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaSymbols = sqlContext.inferSchema(symbols)
    schemaSymbols.registerTempTable("symbols")
    
    trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
            symbols group by symbol, time""")
    trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaTrades = sqlContext.inferSchema(trades)
    schemaTrades.registerTempTable("trades")

    # remove limit after test
    syms = sqlContext.sql("SELECT distinct symbol from trades")
    syms = syms.collect()
    
    df_dict = {}
    print type(syms)
    for sym in syms:
        sym = sym.symbol.strip()
        print sym
        sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))
        
        sym_data = sym_data.collect()
        print len(sym_data)
        sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])
        for i in range(1,11):
            sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)

        for i in range(1,11):
            sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)

        # add labels for price and volume
        sym_df['price_label'] = sym_df['price'].shift(-1)
        sym_df['volume_label'] = sym_df['volume'].shift(-1)
        
        sym_df['price_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
        sym_df['volume_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)


        sym_df = sym_df.dropna()
        df_dict[sym] = sym_df
        print sym_df

    # print for testing
    print len(df_dict)
    print df_dict.keys()
    print type(df_dict[sym])
    sc.stop()
Developer: redame, Project: quote_streaming, Lines of code: 59, Source: spark_model.py

Example 4: test_infer_schema

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def test_infer_schema(self):
    sqlCtx = SQLContext(self.sc)
    rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
    srdd = sqlCtx.inferSchema(rdd)
    schema = srdd.schema()
    field = [f for f in schema.fields if f.name == "features"][0]
    self.assertEqual(field.dataType, self.udt)
    vectors = srdd.map(lambda p: p.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        if isinstance(v, SparseVector):
            self.assertEqual(v, self.sv1)
        elif isinstance(v, DenseVector):
            self.assertEqual(v, self.dv1)
        else:
            raise ValueError("expecting a vector but got %r of type %r" % (v, type(v)))
Developer: greatyan, Project: spark, Lines of code: 18, Source: tests.py

Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def main(sc):

        sqlContext = SQLContext(sc)
        songRdd = sc.textFile("processedSongData/")

        # Load a text file and convert each line to a Row.
        individualSong = songRdd.map(lambda l:l.split('\t'))
        songData = individualSong.map(lambda p: Row(trackId=p[0], loudness=(float(p[1]) if p[1] != u'' else 0), songId=p[36], title=p[43], pitches=p[32], timbre=p[33]))

        # Infer the schema, and register the DataFrame as a table.
        schemaSongData = sqlContext.inferSchema(songData)
        schemaSongData.registerTempTable("songData")

        #test1 = sqlContext.sql("SELECT * FROM userTaste WHERE playCount >= 5 AND playCount <= 10")
        test1 = sqlContext.sql("SELECT * FROM songData WHERE songId = ''")

        songIds = test1.map(lambda p: "songIds: " + p.songId)
        for i in songIds.collect():
                print i
Developer: himaja20, Project: MusicRecommenderSystem, Lines of code: 21, Source: parseSongData.py

Example 6: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
def main(sc):

        sqlContext = SQLContext(sc)
        tasteProfileRdd = sc.textFile("insertsDir/*")

        # Load a text file and convert each line to a Row.
        tasteProfile = tasteProfileRdd.filter(lambda l:len(l) > 0)
        parsedSplits = tasteProfile.map(lambda l: l.split('|'))
        userTaste = parsedSplits.map(lambda p: Row(songId=p[0], songName=p[1], artistName=p[2], playCount=p[3], lastModified=p[4], dateAdded=p[5], artistId=p[6], foreignId=p[7], catalogId=p[8]))

        # Infer the schema, and register the DataFrame as a table.
        schemaUserTaste = sqlContext.inferSchema(userTaste)
        schemaUserTaste.registerTempTable("userTaste")

        test1 = sqlContext.sql("SELECT * FROM userTaste WHERE songId = 'None'")

        songIds = test1.map(lambda p: "songIds: " + p.songId)
        for i in songIds.collect():
                print i
Developer: himaja20, Project: MusicRecommenderSystem, Lines of code: 25, Source: parseUserTaste.py

Example 7: SparkConf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

srcPath="/app/ecom/fc-star/wanggongzheng/wordtxt"
resPath="/app/ecom/fc-star/wanggongzheng/wordres"
appName="word count test"

# Initialize the SQLContext
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Create an RDD from the HDFS text file
textRdd = sc.textFile(srcPath)
# Map each word in every line to (word, 1)
wordSplit = textRdd.flatMap(lambda line: line.split()).map(lambda word: (word, 1))
# reduceByKey to get the per-word counts
wordCounts = wordSplit.reduceByKey(lambda a, b: a+b)

# Create a Row-based RDD
rowRdd = wordCounts.map(lambda x: Row(word=x[0], wc=x[1]))
wordFrames = sqlContext.inferSchema(rowRdd)
wordFrames.registerTempTable("tword")
top10Frames = sqlContext.sql("select word,wc FROM tword order by wc desc limit 10")
print top10Frames.collect()
Developer: neowgz, Project: jobcode, Lines of code: 27, Source: wordcount_sql.py

Example 8: len

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
import sys

assert len(sys.argv) == 2

from pyspark import SparkContext
sc = SparkContext(appName="BDB")

# sc is an existing SparkContext.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Load a text file and convert each line to a dictionary.
lines = sc.textFile("../../../../bdb/rankings/rankings.txt")

parts = lines.map(lambda l: l.split(","))
rankings = parts.map(lambda p: {"pageURL": p[0], "pageRank": int(p[1]), "avgDuration":int(p[2])})

schemaRanking = sqlContext.inferSchema(rankings)
schemaRanking.registerAsTable("rankings")

print "Running query: ", sys.argv[1]
num_query = int(sys.argv[1])

print "Num_query: " + str(num_query) + "\n"

if num_query == 1:
    urls = sqlContext.sql("SELECT pageURL, pageRank FROM rankings WHERE pageRank > 10")
elif num_query == 2:
    urls = sqlContext.sql("SELECT pageURL, pageRank FROM rankings WHERE pageRank > 100")
elif num_query == 3:
    urls = sqlContext.sql("SELECT pageURL, pageRank FROM rankings WHERE pageRank > 1000")
elif num_query == 4:
    urls = sqlContext.sql("SELECT pageURL, pageRank FROM rankings WHERE pageRank > 1000")
Developer: jcarreira, Project: firebox, Lines of code: 33, Source: sql_query1.py

Example 9: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
import sys
from operator import add
from pyspark import SparkContext
from datetime import datetime, timedelta
import datetime
from pyspark.sql import SQLContext, Row

#test comment
#test comment1
#test comment2
sc = SparkContext(appName="claimCount")
sqlContext = SQLContext(sc)
lines = sc.textFile("/data/claim_data/claimdata.txt")
parts = lines.map(lambda l: l.split("|"))
claims = parts.map(lambda p: Row(enroll_id = p[2],claim_id =p[1], place_of_service_cd=p[12],diag_cd =p[75],diag_position = p[73],procedure_cd=p[47]))
schemaClaims = sqlContext.inferSchema(claims)
schemaClaims.registerTempTable("claims")
members = sqlContext.sql("SELECT enroll_id FROM claims WHERE place_of_service_cd ='23' AND diag_cd = '4423' and diag_position='1' and procedure_cd= 'A9579'")
enroll_ids = members.map(lambda p: p.enroll_id)
for enroll_id in enroll_ids.collect():
  print enroll_id
Developer: pkumkum, Project: myworks, Lines of code: 23, Source: claimquery.py

Example 10: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
# Create a Spark context and set it to work
with SparkContext(conf=conf) as sc:

    # Read the parsed records
    directory = "hdfs:///user/{0}/data/tvlogs/".format( sc.sparkUser() )
    logs = sc.textFile( "{0}{1}".format(directory,name) ).map( lambda x: x.split(',') )

    # Turn each record into a Row object
    logRows = logs.map( lambda p: Row( user=p[0], date=p[1], type=p[4], content=p[5]) )

    # Create an SQL context
    sqlContext = SQLContext(sc)

    # Infer the schema. With no sampling ratio, it looks at the first row
    # (and at most the first 100 rows when some fields are still null)
    logSchema = sqlContext.inferSchema( logRows )

    # Register the SchemaRDD as a table.
    logSchema.registerTempTable( "logs" )
    # We can now perform SQL queries over the "logs" table

    # An example query: find out who connected on Aug 1st, when they
    # first connected, and how many events we've got on that day for each one of those users
    data = sqlContext.sql("""
       SELECT user, min(date) AS start, count(*) as num 
          FROM logs 
          WHERE date > '2014-08-01' AND date < '2014-08-02' 
          GROUP by user ORDER BY start""" )

    # The results of SQL queries are RDDs and support all the normal RDD operations.
Developer: calba, Project: spark-dojo, Lines of code: 31, Source: tvlogs4.py
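As a follow-up to the closing comment of Example 10, here is a brief hedged sketch of what those normal RDD operations could look like; it assumes the data result produced by the query above (columns user, start, num), and the message formatting is purely illustrative.

    # Continuing Example 10: the SQL result behaves like an RDD of Row objects
    summary = data.map(lambda row: "%s first seen at %s (%d events)" % (row.user, row.start, row.num))
    for line in summary.collect():
        print line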

Example 11: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row, StructField, StructType, StringType, IntegerType


if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a DataFrame and print the schema
    some_df = sqlContext.inferSchema(some_rdd)
    some_df.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a DataFrame by applying the schema to the RDD and print the schema
    another_df = sqlContext.applySchema(another_rdd, schema)
    another_df.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
Developer: MLDL, Project: spark, Lines of code: 32, Source: sql.py
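Example 11 contrasts inferSchema with applySchema. Both methods were deprecated in Spark 1.3, where SQLContext.createDataFrame covers both cases; as a hedged aside (not part of the original example), the same two DataFrames could be built on Spark 1.3+ roughly like this, reusing the variable names from Example 11:

    # Spark 1.3+ equivalent of Example 11 (sketch only)
    some_df = sqlContext.createDataFrame(some_rdd)                 # schema inferred from the Row fields
    another_df = sqlContext.createDataFrame(another_rdd, schema)   # explicit schema applied
    some_df.printSchema()
    another_df.printSchema()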

Example 12: f1

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
lines = sc.textFile("/data/train/monitorData/part-00001")
def f1(x):
    if x=='IMP':
        return 1
    else:
        return 0    
def f2(x):
    if x=='CLK':
        return 1
    else:
        return 0
parts = lines.map(lambda l: l.split("^"))
#convert each line to a dictionary
record = parts.map(lambda p: Row(user=p[0],isValid=p[1], date=p[-2][0:8],IMP=int(f1(p[-1])),CLK=int(f2(p[-1]))))
# Infer the schema, and register the SchemaRDD as a table.
schemaRecord = sqlContext.inferSchema(record)
schemaRecord.registerTempTable("train_record")
# SQL can be run over SchemaRDDs that have been registered as a table.
trainData=sqlContext.sql("SELECT user,date,sum(IMP) as SUM_IMP, sum(CLK) as SUM_CLK FROM train_record where isValid='1' group by user,date ")
line1 = sc.textFile("/data/train/transformData/*")
part1 = line1.map(lambda l: l.split("^"))
label = part1.map(lambda p: Row(user=p[0],date=p[1][0:8])) # get train label
schemaLabel = sqlContext.inferSchema(label)
schemaLabel.registerTempTable("train_label")
trainData.registerTempTable("trainData")  # note: trainData must be registered as a table before the left outer join with train_label
trainData1=sqlContext.sql("select a.user,a.date as date,SUM_IMP,SUM_CLK,b.date as label from trainData a left outer join train_label  b on (a.user=b.user and a.date=b.date)")
def f(x):
    if x=='None':
        return 0
    else:
        return 1
Developer: lasclocker, Project: Advertising-purchasing-behavior-prediction, Lines of code: 33, Source: train_sql.py

Example 13: sync_table

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
ad_camp_bid = sqlContext.jsonFile(path1)

path2 = "hdfs://ec2-52-8-165-110.us-west-1.compute.amazonaws.com:9000/user/AdReport/ads_info/history"
ad_camp_stats = sqlContext.jsonFile(path2)

#ad_camp_bid.printSchema()
#ad_camp_stats.printSchema()

sync_table(cost_table)

# Register this SchemaRDD as a table.
ad_camp_bid.registerTempTable("ad_camp_bid")
ad_camp_stats.registerTempTable("ad_camp_stats")

# SQL statements can be run by using the sql methods provided by sqlContext.
#info1 = sqlContext.sql("SELECT account_id, bid_type, targeting.age_max, targeting.age_min  FROM ad_camp_bid").collect()
#info2 = sqlContext.sql("SELECT campaign_id,actions_per_impression,clicks,cost_per_unique_click,date_start,date_stop FROM ad_camp_stats").collect()
info3 = sqlContext.sql("SELECT ad_camp_bid.account_id, ad_camp_bid.id, ad_camp_bid.bid_type, ad_camp_stats.clicks, ad_camp_stats.cost_per_unique_click,ad_camp_stats.cost_per_result,ad_camp_stats.cost_per_total_action,ad_camp_stats.result_rate,ad_camp_stats.date_start FROM ad_camp_bid INNER JOIN ad_camp_stats ON ad_camp_bid.id = ad_camp_stats.campaign_id and ad_camp_bid.bid_type IS NOT NULL").collect()
#info4 = sqlContext.sql("SELECT bid_type, COUNT(bid_type) FROM ad_camp_bid GROUP BY bid_type").collect()

del info3[-1]
infoRDD = sqlContext.inferSchema(info3)
infoRDD.registerTempTable("my_table")
q = sqlContext.sql("SELECT bid_type, AVG(cost_per_result) AS avg_cost_per_result, AVG(cost_per_total_action) AS avg_cost_per_action FROM my_table GROUP BY bid_type").collect() 

#infodf= pd.DataFrame(q)
#print (infodf.head(10))
for info in q:
        cost_table.create(bid_type=info[0], cost_per_result=info[1], cost_per_action=info[2])
print ("finished creating table: cost table")
Developer: prarthanabhattarai, Project: AdReportProject, Lines of code: 32, Source: make_charts.py

Example 14: open

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
    t1 = time.time()
    dArrPos = {}
    nFeatures = 0
    revLookup = []
    fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        dArrPos[terms[0]]=(int(terms[1]), int(terms[2]))
        revLookup.append(terms[0])
    bc_dIDF = sc.broadcast(dArrPos)
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to perform idf calc: ", diff

     # Create aggregate vectors from in region data
    t1 = time.time()
    grouped = aggregatedComparison.createAggregatedLabledPoint(records, bUseDate, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1., nMinClusterUnique)\
        .map(lambda records: aggregatedComparison.mapForPrecomp(records, bUseDate, fBinSize))\
        .cache()
    nTotal = grouped.count()
    df = sqlContext.inferSchema(grouped)
    t2 = time.time()
    print nTotal, "entries for dataset"
    diff = t2-t1
    print "Time to get data ready for model by time", diff

    df.repartition(outPart).saveAsParquetFile(outputPath)

    print "<-----BOOM------->"
Developer: theseusyang, Project: GEQE, Lines of code: 32, Source: precomputePoints.py

Example 15: SparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import inferSchema [as alias]
                else:
                    assert False, "Invalid Query Parameter - {}".format(pair)
        else:
            assert False, "Unhandled Options - %s" %o

    sc = SparkContext(appName="TA Query")
    sqlContext = SQLContext(sc)

    #load the target dataset
    targetFile = "hdfs://cdh4-n.migosoft.com/user/athena/ta/{caldate}".format(caldate=CalDate)
    textFile = sc.textFile(targetFile)

    data = textFile.map(lambda l: l.split("\t"))
    ta_raw = data.map(lambda p: Row(Store_id=p[0], CalDate=p[1], PeriodType=p[2], member_id=p[3], Value1=p[4], Value2=p[5], Value3=p[6]))

    schemaTag= sqlContext.inferSchema(ta_raw)
    schemaTag.registerTempTable("ta_raw")

    rules = " and ".join(["{} = '{}'".format(k, v) for k, v in rule_dict.items() if v or v != "NULL"])
    output_path = "_".join(["{}".format(v) for k, v in rule_dict.items() if v or v != "NULL"])

    cmd = "SELECT member_id FROM ta_raw WHERE {} LIMIT 12".format(rules)
    query_info = sqlContext.sql(cmd)

    output = query_info.map(lambda p: p.member_id)
    rule_dict["Member"] = output.collect()

    output_File = "/home/erica_li/tmp/{}.txt".format(output_path)
    if os.path.exists(output_File):
        os.remove(output_File)
        print "remove folder already"
Developer: wuenhouse, Project: Tools_Setting, Lines of code: 33, Source: ta_query.py


Note: The pyspark.sql.SQLContext.inferSchema method examples in this article were compiled by 纯净天空 from open-source code and documentation hosting platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers; the source code copyright belongs to the original authors. Please consult the corresponding project's License before distributing or using the code, and do not reproduce this article without permission.