本文整理汇总了Python中pyspark.sql.SQLContext.cacheTable方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.cacheTable方法的具体用法?Python SQLContext.cacheTable怎么用?Python SQLContext.cacheTable使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext
的用法示例。
在下文中一共展示了SQLContext.cacheTable方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
def main(sc):
    """Aggregate NYC taxi drop-offs per hour and zipcode.

    Reads a taxi-trip CSV, parses each line into a ``Row``, registers the
    data as Spark SQL temp tables, prints the top hour/zipcode counts and
    saves the grouped result as CSV under ``results/``.

    Parameters
    ----------
    sc : SparkContext
        An active Spark context.
    """
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    # Drop the header: the only line(s) containing the token "vendor_id".
    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)
    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))
    # Blank numeric fields are coerced to 0 so int()/float() never raise.
    taxi_rdd = taxi_temp.map(lambda p: Row(
        vendor_id=p[0],
        pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
        dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
        passenger_count=int(p[3] if p[3] != "" else 0),
        trip_distance=float(p[4] if p[4] != "" else 0),
        pickup_longitude=float(p[5] if p[5] != "" else 0),
        pickup_latitude=float(p[6] if p[6] != "" else 0),
        rate_code=p[7],
        store_and_fwd_flag=p[8],
        dropoff_longitude=float(p[9] if p[9] != "" else 0),
        dropoff_latitude=float(p[10] if p[10] != "" else 0),
        payment_type=p[11],
        fare_amount=float(p[12] if p[12] != "" else 0),
        surcharge=float(p[13] if p[13] != "" else 0),
        mta_tax=float(p[14] if p[14] != "" else 0),
        tip_amount=float(p[15] if p[15] != "" else 0),
        tolls_amount=float(p[16] if p[16] != "" else 0),
        total_amount=float(p[17] if p[17] != "" else 0),
        zipcode=p[18]))
    taxi_df = sqlContext.createDataFrame(taxi_rdd)
    taxi_df.registerTempTable("taxi")
    # SQL UDFs used by the queries below.
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
    # Keep only rows with valid drop-off coordinates.
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")
    th.registerTempTable("taxi_hr")
    # Cached because the aggregation below re-scans the table.
    sqlContext.cacheTable("taxi_hr")
    grouped_taxi = sqlContext.sql("select hour, zipcode,str_date(trip_date), count(*) as c from taxi_hr group by hour,zipcode,str_date(trip_date) order by c desc")
    grouped_taxi.show(100)
    # Save this intermediate result to a file as CSV.
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')
    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
示例2: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
# Pull "movies" documents from Elasticsearch into a cached Spark SQL table,
# then build batched "acted_in" queries from the collected movie ids.
# NOTE(review): indentation was lost in extraction and the final for-loop is
# truncated here — code kept verbatim; the loop bodies are the lines that
# directly follow each for-statement.
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
# Small data set: reduce shuffle partitions from the 200 default.
sqlContext.setConf("spark.sql.shuffle.partitions", "5")
# issue movies query
conf = {"es.resource" : "movies2/logs", "es.query" : "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\
"org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)
# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")
# get ids in order to form acted_in query
ids = []
for moviesRow in moviesRowsList:
ids.append(moviesRow['id'])
movieIdSnippets = []
for id in ids:
movieIdSnippets.append("movie_id:" + str(id))
# partition acted_in query
# Batches of 1000 keep each ES query string below the URL/query size limits.
actedInRowsTotalList = []
movieIdSnippetsChunks = list(chunks(movieIdSnippets, 1000))
for chunk in movieIdSnippetsChunks:
movieIdQuery = " OR ".join(chunk)
conf = {"es.resource" : "acted_in2/logs", "es.query" : "?q=" + movieIdQuery, "es.size" : "10000"}
示例3
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
Sentiment = sqlContext.read.parquet("swift://notebooks.spark/sentiment.parquet")
# In[62]:
print Gender
# In[37]:
followers.registerTempTable("followers");
# In[38]:
sqlContext.cacheTable("followers")
plt1 = sqlContext.sql("SELECT * FROM followers where followers not in (2870776) order by userDisplayName desc")
# In[39]:
df1 = plt1.toPandas()
df1=df1.set_index('userDisplayName')
# In[75]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,10
Line=df1.plot(kind='line',title='Count by followers of users',stacked=False)
Line.set_ylabel("No.of follwers")
示例4: SQLContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
# In[3]: create the SQL entry point from the existing SparkContext
sqlContext = SQLContext(sc)

# In[4]: load the followers parquet file
FOLLCOUNT = sqlContext.read.parquet("FOLLOWERS.PARQUET")

# In[5]: register it for SQL access
FOLLCOUNT.registerTempTable("FOLLCOUNT")

# In[20]: cache, then keep only users with more than a million followers
sqlContext.cacheTable("FOLLCOUNT")
Q1 = sqlContext.sql("SELECT * FROM FOLLCOUNT WHERE F_COUNT > 1000000")

# In[21]: bring the filtered rows into pandas, indexed by display name
F1 = Q1.toPandas().set_index('USER_DISPLAY_NAME')

# In[22]: line plot of the follower counts
from pylab import rcParams
import pylab
rcParams['figure.figsize'] = (20, 10)
Line = F1.plot(kind='line', title='Count by followers of users', stacked=False)
示例5: SQLContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
# In[23]: create the SQL entry point from the existing SparkContext
sqlContext = SQLContext(sc)

# In[24]: load the gender parquet file
gender = sqlContext.read.parquet("gender.parquet")

# In[25]: register it for SQL access
gender.registerTempTable("gender")

# In[33]: cache, then select all rows ordered by the second column
sqlContext.cacheTable("gender")
plt1 = sqlContext.sql("SELECT * FROM gender order by 2")

# In[34]: bring the result into pandas, indexed by gender
df1 = plt1.toPandas().set_index('USER_GENDER')

# In[63]: horizontal bar chart of counts per gender
from pylab import rcParams
rcParams['figure.figsize'] = (15, 5)
barh = df1.plot(kind='barh', title='Count of gender speaking about Dance', stacked=False, color='b')
barh.set_ylabel("Gender")
示例6: timeslot
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
def timeslot(f):
    """Map a 12-hour clock time string to one of eight 3-hour slots.

    Parameters
    ----------
    f : str
        Time in ``'%I:%M:%S %p'`` format, e.g. ``'01:30:00 PM'``.

    Returns
    -------
    int
        Slot number 1..8 (1 = 00:00-02:59, ..., 8 = 21:00-23:59).
    """
    # Normalise to 24-hour clock; NOTE(review): '%X' is locale-dependent —
    # the positional hour read below assumes the C-locale 'HH:MM:SS' form.
    time24 = datetime.datetime.strptime(f, '%I:%M:%S %p').strftime('%X')
    # Floor division: the original '/' produced a float slot under Python 3,
    # silently changing the slot labels (e.g. 5.333.. instead of 5).
    return int(time24[0:2]) // 3 + 1
# Schema reference for the crime DataFrame built below.
schemaString = "day timeslot block crimetype latitude longitude"
# Split each record's timestamp once: date part -> weekday, time part -> 3-hour slot.
reformattedCrime = crimeData.map(lambda line: [date2dayofweek(line[1].split(' ', 1)[0]), timeslot(line[1].split(' ', 1)[1]), line[2].split(' ', 1)[1], line[3], line[4], line[5]])
schemaCrime = sqlContext.createDataFrame(reformattedCrime, ['day', 'timeslot', 'block', 'crimetype', 'latitude', 'longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
# The table is scanned by three separate queries below, so keep it cached.
sqlContext.cacheTable("chicagocrimedata")
timeMatrix = sqlContext.sql("SELECT crimetype,timeslot,count(*) AS countPerTime FROM chicagocrimedata group by crimetype,timeslot order by crimetype")
# Extract all classes. Here, distinct crime types.
CrimeTypes = sqlContext.sql("SELECT distinct(crimetype) AS crimetypes FROM chicagocrimedata order by crimetypes").collect()
allCrimeTypes = [row[0] for row in CrimeTypes]
# Extract statistics for the ten most frequent crime types.
crimeCounts = sqlContext.sql("SELECT crimetype,count(*) as crimeCount FROM chicagocrimedata GROUP BY crimetype order by crimeCount desc LIMIT 10").collect()
countByCrimeType = {}
示例7:
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
jsonFile = sqlContext.read.json("swift://notebooks.spark/lobbying.json")
# In[3]:
print jsonFile
# In[4]:
jsonFile.registerTempTable("lobbyings");
# In[5]:
sqlContext.cacheTable("lobbyings")
# In[6]:
lobbyings = sqlContext.sql("SELECT * FROM lobbyings")
# In[7]:
lobbyings.cache()
# In[8]:
lobbyings.printSchema()
示例8: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import cacheTable [as 别名]
# Load smoothed trigram probabilities from HDFS into a cached Spark SQL table,
# then serve lookup requests over a TCP socket on port 54899.
# NOTE(review): indentation was lost in extraction and the while-loop is
# truncated here — code kept verbatim; the loop body is the lines after
# 'while True:'.
from pyspark.sql import SQLContext, Row
conf = SparkConf().setAppName('TriWordCount')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
trigrams = sc.textFile('hdfs:///users/rocks1/12307130174/spark_probabilities_smoothed01/*')
# SECURITY: eval() on each input line executes arbitrary code if the HDFS
# files are not trusted — ast.literal_eval would be the safe equivalent.
trigrams = trigrams.map(lambda line: eval(line)) \
.map(lambda t: Row(word0 = t[0][0], word1=t[0][1], word2=t[0][2], prob=t[1]))
schemaTrigram= sqlContext.createDataFrame(trigrams)
schemaTrigram.registerTempTable("trigram")
# Cache the table so every client query hits memory, not HDFS.
sqlContext.cacheTable("trigram")
#schemaTrigram.cache()
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Bind to all interfaces on port 54899 and accept up to 5 queued connections.
s.bind(("",54899))
s.listen(5)
while True:
#word0, word1 = raw_input(">").split()
print "in loop"
client, _ = s.accept()
print "acccpeted"
recved = client.recv(1024)
print "recived"