本文整理汇总了Python中pyspark.sql.SQLContext.inferSchema方法的典型用法代码示例。如果您正苦于以下问题：Python SQLContext.inferSchema方法的具体用法？Python SQLContext.inferSchema怎么用？Python SQLContext.inferSchema使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext的用法示例。
在下文中一共展示了SQLContext.inferSchema方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def main(sc):
    """Register the taste-profile and song data as temp tables and print song ids.

    Reads tab-separated lines from ``userTaste/*`` and pipe-separated lines
    from ``songsDict/*``, converts every line to a ``Row``, lets
    ``SQLContext.inferSchema`` derive a schema for each data set, registers
    them as the temp tables ``userTaste`` and ``songData``, and prints the
    ``songId`` of at most five ``songData`` rows.

    :param sc: Spark context object used to read the input files and to
        build the ``SQLContext``.
    """
    sqlContext = SQLContext(sc)
    tasteProfileRdd = sc.textFile("userTaste/*")
    songRdd = sc.textFile("songsDict/*")

    # Drop empty lines, then turn each tab-separated line into a Row.
    tasteProfile = tasteProfileRdd.filter(lambda l: len(l) > 0)
    parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
    userTaste = parsedSplits.map(
        lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))

    # Song lines are pipe-separated: song id first, feature set second.
    individualSong = songRdd.map(lambda l: l.split('|'))
    songData = individualSong.map(lambda s: Row(songId=s[0], featureSet=s[1]))

    # Infer the schemas and register both data sets as tables.
    schemaUserTaste = sqlContext.inferSchema(userTaste)
    schemaUserTaste.registerTempTable("userTaste")
    schemaSongData = sqlContext.inferSchema(songData)
    schemaSongData.registerTempTable("songData")

    test2 = sqlContext.sql("select * from songData limit 5")
    # The lambda's parameter is `p`; the previous body referenced an
    # undefined name `s`, which raised NameError when the job ran.
    songIds = test2.map(lambda p: "songIds: " + p.songId)
    for i in songIds.collect():
        print(i)
示例2: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def main():
    """Build per-symbol lagged feature frames from the trade CSV and train on them.

    Loads the CSV through the ``sc`` that must already exist in the
    surrounding scope, aggregates the rows per (symbol, time) into a
    volume-weighted price and a summed volume, and then, for every distinct
    symbol, collects its rows into a pandas DataFrame, adds ten lagged price
    columns, ten lagged rescaled-volume columns and next-step labels, drops
    incomplete rows, stores the frame in a dict and hands it to ``train``.
    """
    sqlContext = SQLContext(sc)

    def as_row(x):
        # Positional 4-tuple -> named Row, so the schema can be inferred.
        return Row(symbol=x[0], time=x[1], price=x[2], volume=x[3])

    csv_fields = sc.textFile(
        "hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200
    ).map(lambda line: line.split(","))
    # Skip the header line(s), recognised by the literal first column.
    body = csv_fields.filter(lambda x: x[0] != 'SYMBOL')
    parsed = body.map(
        lambda p: (p[0].strip(),
                   transform_time(p[1].strip(), p[2].strip()),
                   float(p[3].strip()),
                   float(p[4].strip())))

    sqlContext.inferSchema(parsed.map(as_row)).registerTempTable("symbols")

    aggregate_sql = (
        "SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from\n"
        "symbols group by symbol, time"
    )
    aggregated = sqlContext.sql(aggregate_sql).map(as_row)
    sqlContext.inferSchema(aggregated).registerTempTable("trades")

    # remove limit after test
    distinct_symbols = sqlContext.sql("SELECT distinct symbol from trades").collect()
    frames = {}
    print(type(distinct_symbols))

    per_symbol_sql = "SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time"
    lags = range(1, 11)
    for record in distinct_symbols:
        symbol = record.symbol.strip()
        print(symbol)
        collected = sqlContext.sql(per_symbol_sql.format(symbol)).collect()
        print(len(collected))
        frame = pd.DataFrame(collected, columns=['symbol', 'time', 'price', 'volume'])
        # Predictive model did not like original volume values, so use rescaled value
        frame['volume10k'] = np.round(frame['volume'] / 10000, 3)

        # All price lags are added before any volume lag to keep column order.
        for lag in lags:
            frame['price_t-' + str(lag)] = frame['price'].shift(lag)
        for lag in lags:
            frame['volume10k_t-' + str(lag)] = frame['volume10k'].shift(lag)

        # Labels: next-step value and whether it is higher than the current one.
        frame['price_label'] = frame['price'].shift(-1)
        frame['volume_label'] = frame['volume'].shift(-1)
        frame['price_dir_label'] = np.where(frame.price_label > frame.price, 1, 0)
        frame['volume_dir_label'] = np.where(frame.volume_label > frame.volume, 1, 0)

        frame = frame.dropna()
        frames[symbol] = frame
        print(frame)
        train(symbol, frame)

    # print for testing
    print(len(frames))
    print(frames.keys())
    print(type(frames[symbol]))
示例3: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def main():
    """Build per-symbol lagged feature frames from the trade CSV and print them.

    Creates its own SparkContext, aggregates the CSV rows per (symbol, time)
    into a volume-weighted price and a summed volume, and, for every distinct
    symbol, collects its rows into a pandas DataFrame with ten lagged price
    columns, ten lagged volume columns and 0/1 direction labels for the next
    step. Frames are stored in a dict keyed by symbol and printed.

    The SparkContext is always stopped, even when a step raises.
    """
    conf = SparkConf().setAppName('symbols').set("spark.storage.blockManagerSlaveTimeoutMs", 60000)
    sc = SparkContext(conf=conf)
    # try/finally so the context is released on failure as well as success.
    try:
        sqlContext = SQLContext(sc)
        data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(",")).cache()
        # Skip the header line(s), recognised by the literal first column.
        rows = data.filter(lambda x: x[0] != 'SYMBOL')
        df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip())))
        symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
        schemaSymbols = sqlContext.inferSchema(symbols)
        schemaSymbols.registerTempTable("symbols")

        trades = sqlContext.sql(
            "SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from\n"
            "symbols group by symbol, time")
        trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
        schemaTrades = sqlContext.inferSchema(trades)
        schemaTrades.registerTempTable("trades")

        # remove limit after test
        syms = sqlContext.sql("SELECT distinct symbol from trades")
        syms = syms.collect()
        df_dict = {}
        print(type(syms))
        for sym in syms:
            sym = sym.symbol.strip()
            print(sym)
            sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))
            sym_data = sym_data.collect()
            print(len(sym_data))
            sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])
            # All price lags are added before any volume lag to keep column order.
            for i in range(1, 11):
                sym_df['price_t-' + str(i)] = sym_df['price'].shift(i)
            for i in range(1, 11):
                sym_df['volume_t-' + str(i)] = sym_df['volume'].shift(i)
            # add labels for price and volume: next-step value first, then
            # replaced by a 0/1 flag telling whether it is higher than now
            sym_df['price_label'] = sym_df['price'].shift(-1)
            sym_df['volume_label'] = sym_df['volume'].shift(-1)
            sym_df['price_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
            sym_df['volume_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)
            sym_df = sym_df.dropna()
            df_dict[sym] = sym_df
            print(sym_df)

        # print for testing
        print(len(df_dict))
        print(df_dict.keys())
        # `sym` only exists when the loop ran at least once.
        if syms:
            print(type(df_dict[sym]))
    finally:
        sc.stop()
示例4: test_infer_schema
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def test_infer_schema(self):
    """inferSchema keeps the expected type for ``features`` and returns both vectors intact."""
    context = SQLContext(self.sc)
    points = self.sc.parallelize(
        [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
    inferred = context.inferSchema(points)

    # The "features" field must carry the data type stored on the test case.
    feature_fields = [fld for fld in inferred.schema().fields if fld.name == "features"]
    self.assertEqual(feature_fields[0].dataType, self.udt)

    collected = inferred.map(lambda point: point.features).collect()
    self.assertEqual(len(collected), 2)
    for vec in collected:
        # Pick the fixture matching the concrete vector class; anything
        # else is not a vector this test knows about.
        if isinstance(vec, SparseVector):
            expected = self.sv1
        elif isinstance(vec, DenseVector):
            expected = self.dv1
        else:
            raise ValueError("expecting a vector but got %r of type %r" % (vec, type(vec)))
        self.assertEqual(vec, expected)
示例5: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def main(sc):
    """Register the processed song data as ``songData`` and print rows with an empty song id.

    :param sc: Spark context object used to read ``processedSongData/`` and
        to build the ``SQLContext``.
    """
    sqlContext = SQLContext(sc)

    def to_song_row(p):
        # An empty loudness column is stored as 0 instead of being parsed.
        loudness = float(p[1]) if p[1] != u'' else 0
        return Row(trackId=p[0], loudness=loudness, songId=p[36],
                   title=p[43], pitches=p[32], timbre=p[33])

    # Every input line is tab-separated; only a handful of columns are kept.
    columns = sc.textFile("processedSongData/").map(lambda l: l.split('\t'))
    song_rows = columns.map(to_song_row)

    # Let Spark infer the schema, then expose the data to SQL.
    sqlContext.inferSchema(song_rows).registerTempTable("songData")

    matches = sqlContext.sql("SELECT * FROM songData WHERE songId = ''")
    labelled = matches.map(lambda p: "songIds: " + p.songId)
    for text in labelled.collect():
        print(text)
示例6: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
def main(sc):
    """Register the pipe-separated insert files as ``userTaste`` and print rows whose song id is 'None'.

    :param sc: Spark context object used to read ``insertsDir/*`` and to
        build the ``SQLContext``.
    """
    sqlContext = SQLContext(sc)

    def to_taste_row(p):
        # Columns are taken by position, in the order they appear in a line.
        return Row(songId=p[0], songName=p[1], artistName=p[2],
                   playCount=p[3], lastModified=p[4], dateAdded=p[5],
                   artistId=p[6], foreignId=p[7], catalogId=p[8])

    # Ignore empty lines before splitting on the pipe delimiter.
    non_empty = sc.textFile("insertsDir/*").filter(lambda l: len(l) > 0)
    taste_rows = non_empty.map(lambda l: l.split('|')).map(to_taste_row)

    # Let Spark infer the schema, then expose the data to SQL.
    sqlContext.inferSchema(taste_rows).registerTempTable("userTaste")

    matches = sqlContext.sql("SELECT * FROM userTaste WHERE songId = 'None'")
    labelled = matches.map(lambda p: "songIds: " + p.songId)
    for text in labelled.collect():
        print(text)
示例7: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
# Word-count job: count the words of a text file and print the ten most
# frequent ones using Spark SQL.
srcPath = "/app/ecom/fc-star/wanggongzheng/wordtxt"
resPath = "/app/ecom/fc-star/wanggongzheng/wordres"
appName = "word count test"

# Set up the Spark and SQL contexts.
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# One RDD element per input line.
textRdd = sc.textFile(srcPath)

# Split lines on whitespace and pair every word with an initial count of 1.
wordSplit = textRdd.flatMap(
    lambda text: text.split()
).map(
    lambda token: (token, 1)
)

# Sum the counts per word.
wordCounts = wordSplit.reduceByKey(lambda left, right: left + right)

# Wrap each (word, count) pair in a Row so the schema can be inferred.
rowRdd = wordCounts.map(lambda pair: Row(word=pair[0], wc=pair[1]))
wordFrames = sqlContext.inferSchema(rowRdd)
wordFrames.registerTempTable("tword")

top10Frames = sqlContext.sql(
    "select word,wc FROM tword order by wc desc limit 10")
print(top10Frames.collect())
示例8: len
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
# Exactly one command-line argument (the query number) is expected.
assert len(sys.argv) == 2

from pyspark import SparkContext
sc = SparkContext(appName="BDB")

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Each comma-separated line becomes a dictionary with a URL and two integers.
lines = sc.textFile("../../../../bdb/rankings/rankings.txt")
parts = lines.map(lambda l: l.split(","))
rankings = parts.map(
    lambda p: {"pageURL": p[0], "pageRank": int(p[1]), "avgDuration": int(p[2])})

schemaRanking = sqlContext.inferSchema(rankings)
schemaRanking.registerAsTable("rankings")

# The two spaces reproduce the separator the original print statement emitted.
print("Running query:  %s" % (sys.argv[1],))
num_query = int(sys.argv[1])
print("Num_query: " + str(num_query) + "\n")

# Query number -> pageRank lower bound. Any other number leaves `urls` unset.
# NOTE(review): queries 3 and 4 use the same bound (1000) - confirm intended.
_min_rank_by_query = {1: 10, 2: 100, 3: 1000, 4: 1000}
if num_query in _min_rank_by_query:
    urls = sqlContext.sql(
        "SELECT pageURL, pageRank FROM rankings WHERE pageRank > %d"
        % _min_rank_by_query[num_query])
示例9: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
import sys
from operator import add
from pyspark import SparkContext
from datetime import datetime, timedelta
import datetime
from pyspark.sql import SQLContext, Row
#test comment
#test comment1
#test comment2
# Claim-count job: load pipe-separated claim records, register them as the
# "claims" table and print the enroll ids matching a fixed set of codes.
sc = SparkContext(appName="claimCount")
sqlContext = SQLContext(sc)

lines = sc.textFile("/data/claim_data/claimdata.txt")
parts = lines.map(lambda l: l.split("|"))
# Only the columns needed by the query below are kept, by position.
claims = parts.map(
    lambda p: Row(enroll_id=p[2], claim_id=p[1], place_of_service_cd=p[12],
                  diag_cd=p[75], diag_position=p[73], procedure_cd=p[47]))

# Fix: infer the schema from `claims` (the previous code passed the undefined
# name `claimdata`) and register it under the table name the query uses
# ("claims", not "people"). The variable name is kept for compatibility.
schemaPeople = sqlContext.inferSchema(claims)
schemaPeople.registerTempTable("claims")

members = sqlContext.sql("SELECT enroll_id FROM claims WHERE place_of_service_cd ='23' AND diag_cd = '4423' and diag_position='1' and procedure_cd= 'A9579'")
enroll_ids = members.map(lambda p: p.enroll_id)
for enroll_id in enroll_ids.collect():
    print(enroll_id)
示例10: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
# Run the whole job inside a Spark context created from `conf`.
# `conf` and `name` are expected to be defined before this point.
with SparkContext(conf=conf) as sc:

    # Location of the parsed records, under the Spark user's HDFS home.
    directory = "hdfs:///user/{0}/data/tvlogs/".format(sc.sparkUser())
    log_path = "{0}{1}".format(directory, name)
    logs = sc.textFile(log_path).map(lambda x: x.split(','))

    # Keep four of the comma-separated columns, by position, as a Row.
    logRows = logs.map(
        lambda p: Row(user=p[0], date=p[1], type=p[4], content=p[5]))

    # Infer the schema from the Rows and expose them to SQL as "logs".
    sqlContext = SQLContext(sc)
    logSchema = sqlContext.inferSchema(logRows)
    logSchema.registerTempTable("logs")

    # Per user: earliest date and number of events, restricted to dates
    # strictly between '2014-08-01' and '2014-08-02', ordered by that
    # earliest date.
    data = sqlContext.sql(
        "\n"
        "SELECT user, min(date) AS start, count(*) as num\n"
        "FROM logs\n"
        "WHERE date > '2014-08-01' AND date < '2014-08-02'\n"
        "GROUP by user ORDER BY start"
    )
    # `data` holds the query result for further processing.
示例11: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row, StructField, StructType, StringType, IntegerType
if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # The same three people feed both examples below.
    people = [("John", 19), ("Smith", 23), ("Sarah", 18)]

    # Example 1: an RDD of Rows; the schema is inferred from the data and
    # its fields are named after the Row keyword arguments (name, age).
    some_rdd = sc.parallelize(
        [Row(name=person, age=years) for person, years in people])
    some_df = sqlContext.inferSchema(some_rdd)
    some_df.printSchema()

    # Example 2: an RDD of plain tuples plus an explicit schema with two
    # non-nullable fields, person_name (string) and person_age (integer).
    another_rdd = sc.parallelize(people)
    fields = [
        StructField("person_name", StringType(), False),
        StructField("person_age", IntegerType(), False),
    ]
    schema = StructType(fields)
    another_df = sqlContext.applySchema(another_rdd, schema)
    # Prints the explicit schema declared just above.
    another_df.printSchema()
示例12: f1
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
# Raw monitor-data lines of the training set; each line is later split on "^"
# into one record.
lines = sc.textFile("/data/train/monitorData/part-00001")
def f1(x):
    """Return 1 when *x* equals the string 'IMP', otherwise 0."""
    return 1 if x == 'IMP' else 0
def f2(x):
    """Return 1 when *x* equals the string 'CLK', otherwise 0."""
    return 1 if x == 'CLK' else 0
# Monitor lines are "^"-delimited.
parts = lines.map(lambda l: l.split("^"))


def _to_record(p):
    # One Row per monitor line: the last column is the event type (turned
    # into two 0/1 flags), the second-to-last starts with the 8-character date.
    return Row(user=p[0], isValid=p[1], date=p[-2][0:8],
               IMP=int(f1(p[-1])), CLK=int(f2(p[-1])))


def _to_label(p):
    # One Row per transform line: user plus the 8-character date prefix.
    return Row(user=p[0], date=p[1][0:8])


record = parts.map(_to_record)

# Infer the schema and make the records queryable as "train_record".
schemaRecord = sqlContext.inferSchema(record)
schemaRecord.registerTempTable("train_record")

# Impressions and clicks summed per user and date, valid records only.
_train_sql = "SELECT user,date,sum(IMP) as SUM_IMP, sum(CLK) as SUM_CLK FROM train_record where isValid='1' group by user,date "
trainData = sqlContext.sql(_train_sql)

# Training labels come from the transform data.
line1 = sc.textFile("/data/train/transformData/*")
part1 = line1.map(lambda l: l.split("^"))
label = part1.map(_to_label)
schemaLabel = sqlContext.inferSchema(label)
schemaLabel.registerTempTable("train_label")

# trainData has to be registered as a table before it can be joined.
trainData.registerTempTable("trainData")
_join_sql = "select a.user,a.date as date,SUM_IMP,SUM_CLK,b.date as label from trainData a left outer join train_label b on (a.user=b.user and a.date=b.date)"
trainData1 = sqlContext.sql(_join_sql)
def f(x):
    """Return 0 when *x* equals the string 'None', otherwise 1."""
    return 0 if x == 'None' else 1
示例13: sync_table
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
# Load the two JSON data sets. `path1` is expected to be defined earlier.
ad_camp_bid = sqlContext.jsonFile(path1)
path2 = "hdfs://ec2-52-8-165-110.us-west-1.compute.amazonaws.com:9000/user/AdReport/ads_info/history"
ad_camp_stats = sqlContext.jsonFile(path2)

sync_table(cost_table)

# Make both data sets queryable by name.
ad_camp_bid.registerTempTable("ad_camp_bid")
ad_camp_stats.registerTempTable("ad_camp_stats")

# Join bids with their stats, keeping only bids that have a bid type.
_join_sql = (
    "SELECT ad_camp_bid.account_id, ad_camp_bid.id, ad_camp_bid.bid_type, "
    "ad_camp_stats.clicks, ad_camp_stats.cost_per_unique_click,"
    "ad_camp_stats.cost_per_result,ad_camp_stats.cost_per_total_action,"
    "ad_camp_stats.result_rate,ad_camp_stats.date_start "
    "FROM ad_camp_bid INNER JOIN ad_camp_stats "
    "ON ad_camp_bid.id = ad_camp_stats.campaign_id "
    "and ad_camp_bid.bid_type IS NOT NULL"
)
info3 = sqlContext.sql(_join_sql).collect()

# The last collected row is discarded before the schema is inferred.
del info3[-1]
# NOTE(review): info3 is a collected list here, not an RDD - confirm that
# inferSchema accepts it in the Spark version this runs on.
infoRDD = sqlContext.inferSchema(info3)
infoRDD.registerTempTable("my_table")

# Average cost per result and per action for every bid type.
_avg_sql = "SELECT bid_type, AVG(cost_per_result) AS avg_cost_per_result, AVG(cost_per_total_action) AS avg_cost_per_action FROM my_table GROUP BY bid_type"
q = sqlContext.sql(_avg_sql).collect()

# One cost_table entry per bid type.
for info in q:
    cost_table.create(
        bid_type=info[0],
        cost_per_result=info[1],
        cost_per_action=info[2],
    )
print ("finished creating table: cost table")
示例14: open
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
# Load the term dictionary: each tab-separated line holds a term followed by
# two integers. The mapping is broadcast; the terms are also kept in file order.
t1 = time.time()
dArrPos = {}
nFeatures = 0
revLookup = []
# `with` closes the dictionary file even if a line fails to parse; it was
# previously opened and never closed.
with open(dictFile, "r") as fDict:
    for line in fDict:
        terms = line.split("\t")
        dArrPos[terms[0]] = (int(terms[1]), int(terms[2]))
        revLookup.append(terms[0])
bc_dIDF = sc.broadcast(dArrPos)
nVecLen = len(revLookup)
t2 = time.time()
diff = t2 - t1
# Two spaces reproduce the separator the original print statement emitted.
print("Time to perform idf calc:  %s" % (diff,))

# Create aggregate vectors from in region data
t1 = time.time()
grouped = aggregatedComparison.createAggregatedLabledPoint(records, bUseDate, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1., nMinClusterUnique)\
    .map(lambda records: aggregatedComparison.mapForPrecomp(records, bUseDate, fBinSize))\
    .cache()
# count() runs before inferSchema, so the cached RDD is reused.
nTotal = grouped.count()
df = sqlContext.inferSchema(grouped)
t2 = time.time()
print("%s entries for dataset" % (nTotal,))
diff = t2 - t1
print("Time to get data ready for model by time %s" % (diff,))
df.repartition(outPart).saveAsParquetFile(outputPath)
print("<-----BOOM------->")
示例15: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import inferSchema [as 别名]
else:
assert False, "Invalid Query Parameter - {}".format(pair)
else:
assert False, "Unhandled Options - %s" %o
# Query the TA data set for member ids matching the rules in `rule_dict`
# (`rule_dict` and `CalDate` are expected to be set before this point).
sc = SparkContext(appName="TA Query")
sqlContext = SQLContext(sc)

#load the target dataset
targetFile = "hdfs://cdh4-n.migosoft.com/user/athena/ta/{caldate}".format(caldate=CalDate)
textFile = sc.textFile(targetFile)
data = textFile.map(lambda l: l.split("\t"))
ta_raw = data.map(lambda p: Row(Store_id=p[0], CalDate=p[1], PeriodType=p[2], member_id=p[3], Value1=p[4], Value2=p[5], Value3=p[6]))
schemaTag = sqlContext.inferSchema(ta_raw)
schemaTag.registerTempTable("ta_raw")

# Fix: a rule is used only when it has a value AND that value is not the
# "NULL" placeholder. The previous test (`v or v != "NULL"`) was true for
# every value, so nothing was ever filtered out.
active_rules = [(k, v) for k, v in rule_dict.items() if v and v != "NULL"]
# NOTE(review): rule values are interpolated straight into the SQL text;
# they must come from a trusted source.
rules = " and ".join(["{} = '{}'".format(k, v) for k, v in active_rules])
output_path = "_".join(["{}".format(v) for k, v in active_rules])
if rules:
    cmd = "SELECT member_id FROM ta_raw WHERE {} LIMIT 12".format(rules)
else:
    # No usable rule: an empty WHERE clause would be invalid SQL.
    cmd = "SELECT member_id FROM ta_raw LIMIT 12"

query_info = sqlContext.sql(cmd)
output = query_info.map(lambda p: p.member_id)
rule_dict["Member"] = output.collect()

# Remove a result file left over from an earlier run with the same rules.
output_File = "/home/erica_li/tmp/{}.txt".format(output_path)
if os.path.exists(output_File):
    os.remove(output_File)
    print("remove folder already")