

Python SQLContext.sql Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.SQLContext.sql. If you are wondering what SQLContext.sql does, how to call it, or what it looks like in real code, the curated examples below should help. You can also browse further usage examples of the enclosing class, pyspark.sql.SQLContext.


Fifteen code examples of the SQLContext.sql method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code examples.
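
All of the examples share the same basic pattern: create an SQLContext from a SparkContext, register a DataFrame (or RDD of Rows) as a temporary table, then run SQL text against that table with SQLContext.sql, which returns a DataFrame. As a quick orientation, here is a minimal sketch of that pattern; it assumes Spark 1.x (where SQLContext is the entry point), and the file name people.json, the app name, and the name/age fields are hypothetical.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="SQLContextSqlDemo")
sqlContext = SQLContext(sc)

# Load a hypothetical JSON-lines file into a DataFrame and expose it as a temp table.
people = sqlContext.read.json("people.json")
people.registerTempTable("people")

# SQLContext.sql runs the query against registered temp tables and returns a DataFrame.
adults = sqlContext.sql("SELECT name, age FROM people WHERE age >= 18")
for row in adults.collect():
    print("%s %s" % (row.name, row.age))

sc.stop()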

Example 1: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
  reviews_parquet = sys.argv[1]
  metadata_parquet = sys.argv[2]
  users_ascores_file = sys.argv[3]
  products_ascores_file = sys.argv[4]

  conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
  sc = SparkContext(conf=conf)
  sqlContext = SQLContext(sc)

  sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
  reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
  reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
  users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
  reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
  # join with meth2_users_ascores. join on reviewerid -> ascore is reviewer ascore
  reviews_joined.saveToCassandra("amzdb", "reviews")

  # reviewers need their alternative score
  reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
  # join with meth2_user_ascores. Get ascore and overall_histogram
  reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
  reviewers_joined.saveToCassandra("amzdb", "reviewers")

  # products need their overall score/histogram, and adjusted score/histogram
  sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
  products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
  # join with meth2_product_ascores
  products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
  products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
  products_joined.saveToCassandra("amzdb", "products")
Author: kyledemeule, Project: cmpt-732-amazon-review-analysis, Lines: 33, Source: cassandra_upload.py

Example 2: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
    path = "events"
    #text_file = sc.textFile(path)
    sqlContext = SQLContext(sc)
    events = sqlContext.jsonFile(path)

    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)

    events_df.registerTempTable("events")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

    e = sqlContext.sql("select title, str_date(start_time) as event_date, "
                       "to_hour(start_time) as hour, postal_code from events "
                       "where postal_code is not null and start_time is not null")
    # register the filtered result so the aggregation below can query it
    e.registerTempTable("events_filtered")

    events_grouped = sqlContext.sql("select event_date, hour, postal_code, count(*) "
                                    "from events_filtered group by event_date, hour, postal_code "
                                    "order by postal_code, hour")

    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 32, Source: parse_events.py

Example 3: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
    sqlContext = SQLContext(sc)
    data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(","))
    rows = data.filter(lambda x: x[0] != 'SYMBOL')
    df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip())))

    symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaSymbols = sqlContext.inferSchema(symbols)
    schemaSymbols.registerTempTable("symbols")

    trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
            symbols group by symbol, time""")
    trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaTrades = sqlContext.inferSchema(trades)
    schemaTrades.registerTempTable("trades")

    # remove limit after test
    syms = sqlContext.sql("SELECT distinct symbol from trades")
    syms = syms.collect()

    df_dict = {}
    print type(syms)
    for sym in syms:
        sym = sym.symbol.strip()
        print sym
        sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))

        sym_data = sym_data.collect()
        print len(sym_data)
        sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])

        # Predictive model did not like original volume values, so use rescaled value
        sym_df['volume10k'] = np.round(sym_df['volume'] / 10000, 3)

        for i in range(1,11):
            sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)

        for i in range(1,11):
            #sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)
            sym_df['volume10k_t-'+str(i)] = sym_df['volume10k'].shift(i)

        # add labels for price and volume
        sym_df['price_label'] = sym_df['price'].shift(-1)
        sym_df['volume_label'] = sym_df['volume'].shift(-1)

        sym_df['price_dir_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
        sym_df['volume_dir_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)

        sym_df = sym_df.dropna()
        df_dict[sym] = sym_df
        print sym_df

        train(sym,sym_df)

    # print for testing
    print len(df_dict)
    print df_dict.keys()
    print type(df_dict[sym])
Author: redame, Project: quote_streaming, Lines: 60, Source: build_and_train.py

Example 4: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
    conf = SparkConf().setAppName('symbols').set("spark.storage.blockManagerSlaveTimeoutMs", 60000)
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(",")).cache()
    rows = data.filter(lambda x: x[0] != 'SYMBOL')
    df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip()))) 
    #df = df.filter(lambda x: x[1] != 0)    

    symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaSymbols = sqlContext.inferSchema(symbols)
    schemaSymbols.registerTempTable("symbols")
    
    trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
            symbols group by symbol, time""")
    trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
    schemaTrades = sqlContext.inferSchema(trades)
    schemaTrades.registerTempTable("trades")

    # remove limit after test
    syms = sqlContext.sql("SELECT distinct symbol from trades")
    syms = syms.collect()
    
    df_dict = {}
    print type(syms)
    for sym in syms:
        sym = sym.symbol.strip()
        print sym
        sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))
        
        sym_data = sym_data.collect()
        print len(sym_data)
        sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])
        for i in range(1,11):
            sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)

        for i in range(1,11):
            sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)

        # add labels for price and volume
        sym_df['price_label'] = sym_df['price'].shift(-1)
        sym_df['volume_label'] = sym_df['volume'].shift(-1)
        
        sym_df['price_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
        sym_df['volume_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)


        sym_df = sym_df.dropna()
        df_dict[sym] = sym_df
        print sym_df

    # print for testing
    print len(df_dict)
    print df_dict.keys()
    print type(df_dict[sym])
    sc.stop()
Author: redame, Project: quote_streaming, Lines: 59, Source: spark_model.py

Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    header = taxiFile.first()

    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)

    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))

    taxi_rdd = taxi_temp.map(lambda p: Row(vendor_id=p[0],
    pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
    dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
    passenger_count=int(p[3] if p[3]!="" else 0),
    trip_distance=float(p[4] if p[4]!="" else 0),
    pickup_longitude=float(p[5] if p[5]!="" else 0) ,
    pickup_latitude=float(p[6] if p[6]!="" else 0),
    rate_code=p[7],
    store_and_fwd_flag=p[8],
    dropoff_longitude=float(p[9] if p[9]!="" else 0),
    dropoff_latitude=float(p[10] if p[10]!="" else 0),
    payment_type=p[11],
    fare_amount=float(p[12] if p[12]!="" else 0),
    surcharge=float(p[13] if p[13]!="" else 0),
    mta_tax=float(p[14] if p[14]!="" else 0),
    tip_amount=float(p[15] if p[15]!="" else 0),
    tolls_amount=float(p[16] if p[16]!="" else 0),
    total_amount=float(p[17] if p[17]!="" else 0),
    zipcode=p[18]))

    taxi_df = sqlContext.createDataFrame(taxi_rdd)

    taxi_df.registerTempTable("taxi")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
 
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")

    th.registerTempTable("taxi_hr")
    sqlContext.cacheTable("taxi_hr")

    grouped_taxi = sqlContext.sql("select hour, zipcode,str_date(trip_date), count(*) as c from taxi_hr group by hour,zipcode,str_date(trip_date) order by c desc")
    grouped_taxi.show(100)
 
    #save this intermediate result to a file as csv
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')

    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 53, Source: parse_taxi.py

Example 6: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    tasteProfileRdd = sc.textFile("userTaste/*")
    songRdd = sc.textFile("songsDict/*")

    # Load the text files and convert each line to a Row.
    tasteProfile = tasteProfileRdd.filter(lambda l: len(l) > 0)
    parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
    userTaste = parsedSplits.map(lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))

    individualSong = songRdd.map(lambda l: l.split('|'))
    songData = individualSong.map(lambda s: Row(songId=s[0], featureSet=s[1]))

    # Infer the schemas and register the DataFrames as tables.
    schemaUserTaste = sqlContext.inferSchema(userTaste)
    schemaUserTaste.registerTempTable("userTaste")

    schemaSongData = sqlContext.inferSchema(songData)
    schemaSongData.registerTempTable("songData")

    test2 = sqlContext.sql("select * from songData limit 5")
    songIds = test2.map(lambda p: "songIds: " + p.songId)
    #test1 = sqlContext.sql("SELECT distinct * FROM userTaste limit 5")
    #songIds = test1.map(lambda p: "songIds: " + p.songId)
    for i in songIds.collect():
        print i
Author: himaja20, Project: MusicRecommenderSystem, Lines: 29, Source: userPlayCountsSpark.py

Example 7: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(argv):
    Conf = (SparkConf().setAppName("SimpleGraph"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)


    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+".parquet"

    rawDF = sqlContext.read.parquet(dirPath).registerTempTable("comments")
    
    
    # This is where the magic happens:
    # a SQL self-join pairs each comment with its parent, i.e. users who have interacted with one another.
    df = sqlContext.sql("""
    SELECT t1.subreddit AS Subreddit,
           t1.id        AS OrigId,      t2.id        AS RespId,
           t1.author    AS OrigAuth,    t2.author    AS RespAuth,
           t1.score     AS OrigScore,   t2.score     AS RespScore,
           t1.ups       AS OrigUps,     t2.ups       AS RespUps,
           t1.downs     AS OrigDowns,   t2.downs     AS RespDowns,
           t1.controversiality AS OrigControv, t2.controversiality AS RespControv
    FROM comments t1 INNER JOIN comments t2 ON CONCAT("t1_", t1.id) = t2.parent_id
    WHERE t1.author != '[deleted]' AND t2.author != '[deleted]'
    """)

    # Write the result as parquet: it compresses the data and is fast to read back.
    df.write.parquet("hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+"-selfjoin.parquet")
Author: Swebask, Project: RedditR--Insight-Data-Engineering-Project, Lines: 29, Source: self_joinDump.py

Example 8: Spark_MapReduce_Parents

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
	#tokensofprevlevelkeyword=tokensofprevlevel
	#tokensofprevlevelkeyword.append(keyword)
	md5hashparents = hashlib.md5(keyword).hexdigest()

	#md5hashparents = keyword
	md5hashparents = md5hashparents + "$parents"

	picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
	asfer_pickle_string_dump(keyword,picklef_keyword)
	picklef_keyword.close()
	cachevalue=graphcache.get(md5hashparents)
	if cachevalue:
		print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
		return cachevalue 
	else:	
		#picklelock.acquire()
		spcon = SparkContext("local[2]","Spark_MapReduce_Parents")
		#picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
		#asfer_pickle_string_dump(keyword,picklef_keyword)
		#picklef_keyword.close()
		paralleldata = spcon.parallelize(tokensofprevlevel).cache()
		#k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
		k=paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
		sqlContext=SQLContext(spcon)
		parents_schema=sqlContext.createDataFrame(k.collect())
		parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
		#picklelock.release()
		graphcache.set(md5hashparents,dict_query_results[1])
		spcon.stop()
		print "graphcache_mapreduce_parents updated:", graphcache
		return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines: 37, Source: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py

Example 9: log_mapreducer

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def log_mapreducer(logfilename, pattern, filt="None"):
        spcon=SparkContext()
	if filt == "None":
        	input=open(logfilename,'r')
        	paralleldata=spcon.parallelize(input.readlines())
        	patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
		print "pattern lines",patternlines.collect()
        	matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
	else:
        	input=spcon.textFile(logfilename)
		matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
        matches_collected=matches.collect()
	print "matches_collected:",matches_collected
	if len(matches_collected) > 0:
		sqlContext=SQLContext(spcon)
		bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
		bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
		query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
		dict_query_results=dict(query_results.collect())
        	print "----------------------------------------------------------------------------------"
        	print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
        	print "----------------------------------------------------------------------------------"
		dict_matches=dict(matches_collected)
		sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
        	print "pattern matching lines:",sorted_dict_matches 
        	print "----------------------------------------------------------------------------------"
		print "SparkSQL DataFrame query results:"
        	print "----------------------------------------------------------------------------------"
		pprint.pprint(dict_query_results)
        	print "----------------------------------------------------------------------------------"
		print "Cardinality of Stream Dataset:"
        	print "----------------------------------------------------------------------------------"
		print len(dict_query_results)
		spcon.stop()
        	return sorted_dict_matches 
Author: shrinivaasanka, Project: usb-md-github-code, Lines: 37, Source: Spark_USBWWANLogMapReduceParser.py

Example 10: Spark_MapReduce

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def Spark_MapReduce(level, wordsatthislevel, graphcache):
	freqterms1_local=wordsatthislevel
	md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
	#md5hash = ",".join(wordsatthislevel)
	cachevalue=graphcache.get(md5hash)
	if cachevalue: 
		print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
		return cachevalue 
	else:	
		spcon=SparkContext("local[2]","Spark_MapReduce")
		print "Spark_MapReduce(): wordsatthislevel:",wordsatthislevel
		paralleldata=spcon.parallelize(wordsatthislevel).cache()
		#k=paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
		k=paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
		#k=paralleldata.map(mapFunction).reduceByKey(reduceFunction)

		#dict_k=k.collect()
		#s = sorted(dict_k.items(),key=operator.itemgetter(1), reverse=True)
		#print "Spark MapReduce results:"
		#print s
		############################
		sqlContext=SQLContext(spcon)
		recursiveglossoverlap_schema=sqlContext.createDataFrame(k.collect())
		recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce() - SparkSQL DataFrame query results:"
		#print dict_query_results[1]
		graphcache.set(md5hash, dict_query_results[1])
		print "graphcache_mapreduce updated:", graphcache
		spcon.stop()
		return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines: 34, Source: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py

Example 11: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating and SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Author: alt-code, Project: AutoSpark, Lines: 33, Source: tweet_scanner.py

Example 12: run

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1],p[2],p[3],p[4],p[5],p[6],p[7],
         p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15],p[16],p[17],p[18],p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString )
     fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p : (p,1))
     counts = genders.reduceByKey(lambda a, b: a + b) #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
Author: Zarana-Parekh, Project: analytics, Lines: 31, Source: genderTask.py

Example 13: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(dataFile, outputPath):

    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)

    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)

    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")

    tcp_interactions = sqlContext.sql( """
        SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes=0
    """)

    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()

    sc.stop()
Author: yuantuo, Project: pysparkexample, Lines: 31, Source: example.py

Example 14: get_latest_data

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
    def get_latest_data(self):
        from pyspark.sql import SparkSession
        import config
        import pandas as pd
        # initialise sparkContext
        spark1 = SparkSession.builder \
            .master(config.sp_master) \
            .appName(config.sp_appname) \
            .config('spark.executor.memory', config.sp_memory) \
            .config("spark.cores.max", config.sp_cores) \
            .getOrCreate()

        sc = spark1.sparkContext

        # using SQLContext to read parquet file
        from pyspark.sql import SQLContext
        sqlContext = SQLContext(sc)

        from datetime import datetime
        t1 = datetime.now()
        df = sqlContext.read.parquet(config.proj_path+'/datas/appid_datapoint_parquet1')
        # create a temporary table and query it
        df.registerTempTable('dummy')
        df1 = sqlContext.sql('select count(distinct application) as app_count, time_stamp, source from dummy group by source, time_stamp')

        # data cleaning
        self.p2_df = df1.toPandas()
        
        dates_outlook = pd.to_datetime(pd.Series(self.p2_df.time_stamp),unit='ms')
        self.p2_df.index = dates_outlook   
        self.p2_df['date'] = self.p2_df.index.date
        self.p2_df = self.p2_df.sort_values(by='time_stamp')
    
        t2 =datetime.now()
        time_to_fetch = str(t2-t1)
Author: abhoopathi, Project: friendly-lamp, Lines: 37, Source: p2_api.py

Example 15: TestSQL

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
class TestSQL(PySparkTestCase):

    def setUp(self):
        PySparkTestCase.setUp(self)
        self.sqlCtx = SQLContext(self.sc)

    def test_basic_functions(self):
        rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
        srdd = self.sqlCtx.jsonRDD(rdd)
        srdd.count()
        srdd.collect()
        srdd.schemaString()
        srdd.schema()

        # cache and checkpoint
        self.assertFalse(srdd.is_cached)
        srdd.persist(StorageLevel.MEMORY_ONLY_SER)
        srdd.unpersist()
        srdd.cache()
        self.assertTrue(srdd.is_cached)
        self.assertFalse(srdd.isCheckpointed())
        self.assertEqual(None, srdd.getCheckpointFile())

        srdd = srdd.coalesce(2, True)
        srdd = srdd.repartition(3)
        srdd = srdd.distinct()
        srdd.intersection(srdd)
        self.assertEqual(2, srdd.count())

        srdd.registerTempTable("temp")
        srdd = self.sqlCtx.sql("select foo from temp")
        srdd.count()
        srdd.collect()
Author: zjmwqx, Project: spark-, Lines: 35, Source: tests.py


Note: The pyspark.sql.SQLContext.sql examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce without permission.