

Python SQLContext.createDataFrame Method Code Examples

This article collects typical usage examples of the pyspark.SQLContext.createDataFrame method in Python. If you are wondering what SQLContext.createDataFrame does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further usage examples of pyspark.SQLContext, the class this method belongs to.


The following presents 15 code examples of the SQLContext.createDataFrame method, sorted by popularity by default.
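Before the collected examples, here is a minimal, self-contained sketch of the typical call pattern. It is only an illustration: the app name, column names, and sample rows are assumptions for this sketch and are not taken from any of the projects below.

from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, LongType

sc = SparkContext(appName="createDataFrameSketch")  # hypothetical app name
sql_context = SQLContext(sc)

# Supplying an explicit schema avoids type inference over the input rows.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("score", LongType(), True),
])

# createDataFrame accepts a local list of tuples, an RDD of tuples, or an RDD of Rows.
df = sql_context.createDataFrame([("alice", 10), ("bob", 20)], schema)
df.show()

sc.stop()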

Example 1: hash_rating

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def hash_rating(author_subreddit_rating_rdd, sc):
    sql_context = SQLContext(sc)

    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd, author_sub_schema)

    author_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[0])  # keep only the author
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()

    subreddit_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[1])  # keep only the subreddit
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()
    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(lambda row: (row.author_id, row.subreddit_id, row.rating))

    return aid_rdd, sid_rdd, aid_sid_r_rdd
Developer: wmaciel, Project: redditDataAnalysis, Lines: 33, Source: sub_recommender.py

Example 2: _get_data

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    def _get_data(self):
        sql_context = SQLContext(self.sc)
        l = [
            (
                "I dont know why people think this is such a bad movie.",
                # indices of a size-3 SparseVector must lie in [0, 2]
                Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})
            ),
        ]
        return sql_context.createDataFrame(l, ['text', 'features'])
Developer: ngarneau, Project: sentiment-analysis, Lines: 11, Source: transformers.py

Example 3: _get_train_data

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    def _get_train_data(self):
        sql_context = SQLContext(self.sc)
        l = [
            (1, Vectors.dense([1, 2, 3]), 1.0),
            (2, Vectors.dense([1, 2, 3]), 0.0),
            (3, Vectors.dense([1, 2, 3]), 1.0),
            (4, Vectors.dense([1, 2, 3]), 0.0),
        ]
        return sql_context.createDataFrame(l, ['id', 'features', 'label'])
Developer: ngarneau, Project: sentiment-analysis, Lines: 11, Source: pipelines.py

Example 4: print

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
mirror_dir = "data/mirror"
data_dir = "data/data-{0}".format(dataset_date)
out_dir = "data/bhl-{0}.parquet".format(dataset_date)

if os.path.isdir(out_dir):
    print("Output dir {0} exists".format(out_dir))
    sys.exit()  # a bare `exit` is a no-op; sys.exit() actually stops the script (requires `import sys`)


get_ocr_udf = sql.udf(get_ocr, types.StringType())
fn = os.path.join(data_dir, "item.txt")

# Optional limit for testing, add this to the chain as second step
# .sample(withReplacement=False, fraction=0.001) \
sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item()) \
    .withColumn("ocrtext", get_ocr_udf(sql.col("barcode"))) \
    .write.parquet(out_dir)


# Example run on Elk (16 thread single machine)
#real    84m21.818s
#user    198m57.612s
#sys     15m19.662s

# Example run on okapi (128 thread single machine)
#real    41m13.984s
#user    482m34.084s
#sys     278m12.404s

Developer: bio-guoda, Project: guoda-datasets, Lines: 30, Source: build_parquet.py

Example 5: main

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def main():
    conf = SparkConf().setAppName("climate")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    climateSchema = StructType(
        [
            StructField("station", StringType(), False),
            StructField("date", IntegerType(), False),
            StructField("element", StringType(), False),
            StructField("value", IntegerType(), True),
            StructField("mflag", StringType(), True),
            StructField("qflag", StringType(), True),
            StructField("sflag", StringType(), True),
            StructField("obstime", StringType(), True),
        ]
    )
    info = sqlContext.read.format("com.databricks.spark.csv").options(header="false").schema(climateSchema).load(inputs)
    info.registerTempTable("info")
    stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info ")
    stationinfo.registerTempTable("stationinfo")
    stationinfo.cache()

    prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP' ")
    prcpTable.registerTempTable("prcpTable")
    prcpTable.cache()
    # prcpTable.show()

    # create 3 tables that hold the monthly average of min, max temperature and prcp
    yearlyprcp = sqlContext.sql(
        "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy "
    )
    yearlyprcp.registerTempTable("prcpMean")
    # yearlyprcp.show()

    # get information about stations from stations.txt

    def getdata(line):
        line = line.split("  ")
        values = [x.strip() for x in line]
        return values

    stations = sc.textFile(input2)
    stations = stations.map(getdata)
    stations = stations.map(lambda abc: Row(station=abc[0], latitude=float(abc[1]), longitude=float(abc[2]))).cache()
    stationDF = sqlContext.createDataFrame(stations)
    stationDF.registerTempTable("StationTable")
    stationDF.cache()

    # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable")
    # param.show()

    # Join to station file to add latitude and longitude and stationID
    result = (
        stationDF.join(yearlyprcp)
        .where(stationDF.station == yearlyprcp.station)
        .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude)
    )

    # save into parquet file
    result.write.format("parquet").save(output)
Developer: sasoltan, Project: DroughtPercipitation, Lines: 63, Source: climateweather.py

Example 6: features_to_vec

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def features_to_vec(length, entropy, alexa_grams, word_grams):
    high_entropy = 0.0
    high_length = 0.0
    if entropy > 3.5: high_entropy = 1.0
    if length > 30: high_length = 1.0
    return Vectors.dense(length, entropy, high_entropy, high_length, alexa_grams, word_grams)


#dga_domains = sc.textFile("/user/cloudera/dga.txt")
#dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
#dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

words = sc.textFile("/user/cloudera/words.txt")
words = words.map(lambda x: (x, "dict", float(len(x)), entropy(x)))
words_df = sqlctx.createDataFrame(words, schema).dropna().distinct().cache()

dga_domains = sc.textFile("/user/cloudera/c_domains_*")
dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

alexa_domains = sqlctx.read.format('com.databricks.spark.csv').options(header='false', inferschema='true').load(
    'alexa_100k.csv')\
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_df = sqlctx.createDataFrame(alexa_domains, schema).dropna().distinct().cache()

alexa_domains_1M = sqlctx.read.format('com.databricks.spark.csv').options(header='false', inferschema='true').load(
    'alexa_1M.csv')\
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_1M = sqlctx.createDataFrame(alexa_domains_1M, schema).distinct().cache()
Developer: jleaniz, Project: bdsa, Lines: 31, Source: dga_detect.py

Example 7: StructField

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    # uid,adid,guuid,createtime
    fields = [
        StructField('uid', StringType(), True),
        StructField('adid', StringType(), True),
        StructField('guuid', StringType(), True),
        StructField('guuidctime', LongType(), True),
        StructField('url', StringType(), True),

        StructField('referer', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('gwid', StringType(), True),
        StructField('ua', StringType(), True),
        StructField('ip', StringType(), True),

        StructField('createtime', LongType(), True),

    ]

    schema = StructType(fields)

    # [(),()] ['','']
    df_dest = sqlContext.createDataFrame(rdd, schema)
    df_dest.registerTempTable("back_portal_loginlog")

    #df_dest.rdd.foreach(my_print)
    # save
    df_dest.write.parquet(output)


    sc.stop()
Developer: wangcunxin, Project: spark_py, Lines: 32, Source: transfer_advload.py

Example 8: str

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    nn_gridsearch.debug('-'*40)
    nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))

    # with open('~/.aws/credentials.json') as f:
    #     CREDENTIALS = json.load(f)

    sc = set_spark_context()

    conn = S3Connection()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='rdd.pkl')


    # (key, (bow, label)) -> (label, bow)
    bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0])) \
            .sample(withReplacement=False, fraction=.5, seed=1)
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                              numFeatures=num_features)
        tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                    outputCol="features", minDocFreq=min_doc_freq)
        indexer = StringIndexer(inputCol="string_label", outputCol="label")
Developer: Nathx, Project: parental_advisory_ml, Lines: 32, Source: nn_grid_search.py

Example 9: StructField

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
    schema = StructType(fields)

    rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup is not None)
    # rdd1.foreach(printx)
    # sc.stop()

    ret_df = sqlContext.createDataFrame(rdd1, schema)
    ret_df.registerTempTable("loginflowlog_overall")
    _sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
           "from loginflowlog_overall " \
           "group by logtype"
    rs_df = sqlContext.sql(_sql)

    service = LoginflowlogMysqlService()
    ret_overall_list = service.getRetOverall(rs_df.collect(), day)
    _sql_delete = "delete from login_flow_global_count where date ='%s'" % day
    _sql_insert = "insert into login_flow_global_count(date," \
                  "prelogin_num,prelogin_pnum,login_num,login_pnum," \
                  "login_click_num,login_click_pnum,forward_num,forward_pnum," \
                  "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
Developer: wangcunxin, Project: spark_py, Lines: 32, Source: loginflowlog2mysql_update.py

Example 10: parsePoint

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
# Load and parse the data
# line format: (station, latitude, longitude,)


def parsePoint(line):
    return LabeledPoint(line[0], line[1:])

# read data from station file
def getdata(line):
    line = line.split('  ')
    values = [x.strip() for x in line]
    return values
stations = sc.textFile(input)
stations = stations.map(getdata)
stations = stations.map(lambda abc: (float(hash(abc[0])), int(year), float(abc[1]), float(abc[2]))).cache()
stationsDF = sqlContext.createDataFrame(stations)

# create dataset to fit into model
parseData = stations.map(parsePoint)

# load the model
sameModel = LinearRegressionModel.load(sc, myModelPath)

# run the model
stationidAndPreds = parseData.map(lambda p : (p.label,  float(sameModel.predict(p.features))))
stationidAndPredsDF = sqlContext.createDataFrame(stationidAndPreds)

# the result returns a predicted value for each station (stationId) in the given year
# joining the stations rdd with stationidAndPreds to find the latitude and longitude of each station
result = stationsDF.join(stationidAndPredsDF).where(stationidAndPredsDF[0]==stationsDF[0]).select(stationidAndPredsDF[1], stationsDF[2], stationsDF[3])
Developer: sasoltan, Project: DroughtPercipitation, Lines: 32, Source: yearPrediction.py

Example 11: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
"""
from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PipelineExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Prepare training documents from a list of (id, text, label) tuples.
    training = sqlContext.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0)], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = sqlContext.createDataFrame([
Developer: 0xqq, Project: spark, Lines: 34, Source: pipeline_example.py

Example 12: SparkConf

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
parser.add_argument('deaths')
parser.add_argument('output')
args = parser.parse_args()

conf = SparkConf().setAppName("correlate")
sc = SparkContext(conf=conf)
sql = SQLContext(sc)

births_raw = sql.read.load(args.births).rdd
deaths_raw = sql.read.load(args.deaths).rdd

births = births_raw.map(to_joinable_on_id)
deaths = deaths_raw.map(to_joinable_on_id)

both = births.fullOuterJoin(deaths)
unjoined_births = both.filter(get_unjoined_births)
unjoined_deaths = both.filter(get_unjoined_deaths)
correctly_joined = both.filter(remove_unjoined_all).map(to_joined_format)

# do a join with jaro-winkler
jaro_input_births = unjoined_births.map(to_jaro_matching_input)
jaro_input_deaths = unjoined_deaths.map(to_jaro_matching_input)
jaro_input_all = jaro_input_births.cartesian(jaro_input_deaths)
jaro_joined = jaro_input_all.filter(jaro_match).map(cart_to_joined_format)

to_save = sql.createDataFrame(correctly_joined)
to_save.write.save(args.output + '/joined', format="parquet")

to_save = sql.createDataFrame(jaro_joined)
to_save.write.save(args.output + '/jaro_joined', format="parquet")
Developer: nevermore0, Project: MDP-Cloud-Winter-2017, Lines: 32, Source: correlate.py

Example 13: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
            .setAppName("adhoscount")
            .set("spark.kryoserializer.buffer.mb", "256")
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)

    _adloadDF=sqlContext.read.parquet(adLoadFiles)
    _adloadRdd=_adloadDF.rdd.map(lambda x:(x.guuid,x.hosid)).groupByKey().map(fetchOne)

    fields = [
        StructField('guuid', StringType(), True),
        StructField('hosid', StringType(), True),
        ]
    schema = StructType(fields)
    schemaDest = sqlContext.createDataFrame(_adloadRdd, schema)
    schemaDest.registerTempTable("ghid")

    _adloadDF.registerAsTable("adload")
    sqlContext.read.parquet(adPlayFiles).registerAsTable("adplay")
    sqlContext.read.parquet(adClickFiles).registerAsTable("adclick")

    '''
    _adLoadDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823569766},
        {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
        {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
    ]).registerAsTable("adload")
    _adPlayDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
Developer: wangcunxin, Project: spark_py, Lines: 33, Source: adcount.py

Example 14: main

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def main(argv):
    # list of words to look for!
    GODWINS_WORDS = ['hitler', 'nazi']

    # setup inputs and outputs
    input_directory = argv[0]
    output_directory = argv[1]

    # spark specific setup
    conf = SparkConf().setAppName('godwin whaaa')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # read input
    text = sc.textFile(input_directory)
    text = text.repartition(200)

    # convert to magic json formatting
    loadedJson = text.map(lambda line: json.loads(line))

    # make the json skinnier by removing unwanted stuff
    fullRedditJson = loadedJson.map(lambda jObj: (jObj['body'], jObj['name'], jObj['parent_id'])).cache()

    # code from greg for regex to parse lines
    linere = re.compile(regex_from_words(GODWINS_WORDS))

    # now filter out stuff without GODWINS_WORDS "body","id", "subreddit", "parent_id" 
    # each element is a (body, name, parent_id) tuple
    godwinJsonList = fullRedditJson.filter(lambda bnp: linere.match(bnp[0].lower()))
    
    # We don't need the comment body anymore...
    # We need to find the paths now...
    godwin_node_rdd = godwinJsonList.map(row_into_node).cache()
    full_node_rdd = fullRedditJson.map(row_into_node)

    # we also need a list of node names so we can later check if we already visited it.
    godwinNodes = godwin_node_rdd.map(lambda node: node[0])  # keep only the name

    # Convert full data RDD into SQL Data Frame
    subredditSchema = StructType([
        StructField("name", StringType(), True),
        StructField("parent_id", StringType(), True)
    ])
    full_node_df = sqlContext.createDataFrame(full_node_rdd, subredditSchema)

    # Convert godwin rows RDD into SQL Data Frame
    godwinSchema = StructType([
        StructField("g_name", StringType(), True),
        StructField("g_parent_id", StringType(), True)
    ])
    godwin_node_df = sqlContext.createDataFrame(godwin_node_rdd, godwinSchema).cache()

    count_down = godwin_node_df.count()
    print('There are', count_down, 'comments with a godwins word')
    depth = 0
    nodes_per_depth = {}
    visited_node_list_df = godwin_node_df.select(godwin_node_df.g_name)
    print('visited_node_list_df')
    print(str(visited_node_list_df.count()))
    
    while count_down > 0 and depth < 100:

        depth += 1
        # Join find next layer of nodes
        joined_df = godwin_node_df.join(full_node_df,
                                        [godwin_node_df['g_parent_id'] == full_node_df['name']])
        
        # Drop the columns of the older node
        next_node_df = joined_df.select(
            joined_df['name'].alias('g_name'),
            joined_df['parent_id'].alias('g_parent_id')).cache()
        print('next_node_df count: ' + str(next_node_df.count()))
        
        # Select only the ones that have NOT been visited
        # TODO: is there a better way?
        leftt = next_node_df.join(visited_node_list_df, next_node_df.g_name == visited_node_list_df.g_name, 'left')
        next_node_df = leftt.select(next_node_df.g_name, next_node_df.g_parent_id, visited_node_list_df.g_name.alias('dup'))
        next_node_df = next_node_df.fillna({'dup':'xxxxxx'})
        next_node_df = next_node_df.filter(next_node_df.dup == 'xxxxxx')
        next_node_df = next_node_df.drop(next_node_df.dup)


        # add the g_name to the list of visited nodes 
        # TODO: make more efficient!
        visited_df = next_node_df.select(next_node_df.g_name)
        visited_node_list_df = visited_node_list_df.unionAll(visited_df)
        visited_node_list_df = visited_node_list_df.dropDuplicates()
        
        count_up = next_node_df.count()
        n_nodes = count_down - count_up
        print('number of godwin nodes of height', depth, '=', n_nodes)
        nodes_per_depth[depth] = n_nodes
        count_down = count_up

        godwin_node_df = next_node_df

    avg = compute_average_godwin(nodes_per_depth)
    print('The average distance to the godwin words is', avg)

    fp = open(output_directory + 'average.txt', 'w')  # open for writing; the default read mode would make fp.write() fail
    fp.write(str(avg) + '\n')
# ......... part of the code omitted .........
Developer: wmaciel, Project: redditDataAnalysis, Lines: 103, Source: godwin.py

Example 15: StructField

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
    #(u'2015-48_6C25B958F2CC_175', u'2015120120')
    #rdd1.foreach(my_print)
    #(u'2015-50_7014A62FA5B0_0', [u'22',u'23'])
    rdd1_2 = rdd1_1.groupByKey().mapValues(list).sortByKey().map(times_count_first)
    #(u'2015-48_903C920CAE97_655', [u'15_1'])
    #rdd1_2.foreach(my_print)

    rdd2_1 = df.rdd.map(convert_kv_last)
    rdd2_2 = rdd2_1.groupByKey().mapValues(list).sortByKey().map(times_count_last)

    rdd3 = rdd1_2.join(rdd2_2).map(convert_rets).values().flatMap(list)
    #(u'2015', u'48', u'A09347EC9FBB', u'189', u'13', u'1', u'14', u'1')
    rdd3.foreach(my_print)
    logger.info(rdd3.count())
    fields = [
        StructField('year', StringType(), True),
        StructField('week', StringType(), True),
        StructField('mac', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('firstTime', StringType(), True),
        StructField('firstCount', LongType(), True),
        StructField('lastTime', StringType(), True),
        StructField('lastCount', LongType(), True)
    ]
    schema = StructType(fields)

    df1 = sqlContext.createDataFrame(rdd3, schema)
    df1.coalesce(2).write.parquet(output, 'overwrite')


    sc.stop()
Developer: wangcunxin, Project: spark_py, Lines: 33, Source: user_visittimes_month.py


Note: The pyspark.SQLContext.createDataFrame examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please refer to each project's license before distributing or using the code. Do not reproduce without permission.