This page collects typical code examples of the Python method pyspark.SparkContext.sequenceFile. If you have been wondering what exactly SparkContext.sequenceFile does, how to call it, or what real-world code that uses it looks like, the hand-picked examples below should help. You can also look further into usage examples for the containing class, pyspark.SparkContext.
The 15 code examples of SparkContext.sequenceFile shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
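
Before the individual examples, here is a minimal, self-contained sketch of the call itself. The master URL, app name, input path, and key/value classes are placeholders chosen for illustration, not values taken from any example below:

from pyspark import SparkContext

sc = SparkContext("local[2]", "SequenceFileDemo")

# Read a Hadoop SequenceFile of Text keys and Text values into an RDD of
# (key, value) string pairs.  Path and writable classes are placeholders.
pairs = sc.sequenceFile("hdfs:///tmp/example.seq",
                        "org.apache.hadoop.io.Text",
                        "org.apache.hadoop.io.Text")

print(pairs.take(5))
sc.stop()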
Example 1: recom
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def recom(matrix_file_name, user_file_name, output="re.out"):
sc = SparkContext("local[8]", "Recommendation")
""" Reads in a sequence file FILE_NAME to be manipulated """
matrix = sc.sequenceFile(matrix_file_name)
user = sc.sequenceFile(user_file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
user_tuples = user.flatMap(flat_user) \
.map(map_user) \
.sortByKey(keyfunc=lambda k: int(k))
keys = user_tuples.keys().collect()
matrix_tuples = matrix.flatMap(flat_matrix) \
.map(map_matrix) \
.filter(lambda x: x[0] in keys)
global mt
mt = matrix_tuples.collectAsMap()
recm = user_tuples.flatMap(flat_recom) \
.reduceByKey(reduce_recom) \
.filter(lambda x: x[0] not in keys) \
.sortBy(lambda (key, value): int(value))
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
recm.coalesce(1).saveAsTextFile(output)
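
Note that this example collects the filtered matrix to the driver with collectAsMap and exposes it through the global mt so that flat_recom (defined elsewhere in the original project) can look rows up by key. A hedged alternative sketch, keeping the same lookup idea but shipping the table through a broadcast variable instead of a module-level global; flat_recom_bc is a hypothetical variant, not part of the original code:

# Sketch only: broadcast the collected matrix instead of relying on a global.
mt_broadcast = sc.broadcast(matrix_tuples.collectAsMap())

def flat_recom_bc(user_row):
    mt = mt_broadcast.value        # local lookup table on each executor
    # ... same recommendation logic as flat_recom, using mt ...
    return []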
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    noPublisherRecords = sc.accumulator(0)
    noPublisherNameRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    keyCounts = data.values().flatMap(getKeys).countByValue()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "noPublisherRecords = %d" % noPublisherRecords.value
    print "noPublisherNameRecords = %d" % noPublisherNameRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
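
The getKeys helper is not included in this snippet. Based only on the accumulator names and the flatMap/countByValue usage above, a plausible reconstruction (an assumption, not the original implementation) parses each value as JSON, tracks parse failures and missing publisher fields, and emits the record's top-level field names:

import json

def getKeys(value):
    # Hypothetical reconstruction of getKeys; only the accumulator names
    # come from the example above.
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    try:
        record = json.loads(value)
    except ValueError:
        badJsonRecords += 1
        return iter([])
    goodJsonRecords += 1
    publisher = record.get("publisher")
    if publisher is None:
        noPublisherRecords += 1
    elif "name" not in publisher:
        noPublisherNameRecords += 1
    # Emit the top-level field names so countByValue() yields a histogram
    # of which keys appear across records.
    return iter(record.keys())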
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
Example 4: user_artist_matrix
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def user_artist_matrix(file_name, output="user_artist_matrix.out"):
sc = SparkContext("local[8]", "UserArtistMatrix")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
ua_matrix = file.flatMap(ua_flat_doc) \
.map(ua_map) \
.reduceByKey(ua_reduce) \
.sortByKey(keyfunc=lambda k: int(k))
ua_matrix = ua_matrix.flatMap(ua_flat_vec)
global avg_matrix
avg_matrix = ua_matrix.reduceByKey(ua_reduce_vec) \
.map(ua_map_avg)
avg_matrix = avg_matrix.collectAsMap()
co_matrix = ua_matrix.map(ua_map_cmp) \
.reduceByKey(ua_reduce_cmp) \
.map(ua_map_cmp_final)
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
co_matrix.coalesce(1).saveAsTextFile(output)
Example 5: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                 token.count, token.cumulativeCount, fractionOfTotal))
        print "========================================"
Example 6: docwordcount
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
sc = SparkContext("local[8]", "DocWordCount")
file = sc.sequenceFile(file_name)
counts = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
counts.coalesce(1).saveAsTextFile(output)
Example 7: index
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def index(file_name, output="spark-wc-out-index"):
sc = SparkContext("local[8]", "Index")
file = sc.sequenceFile(file_name)
indices = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
indices.coalesce(1).saveAsTextFile(output)
Example 8: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
args = parser.parse_args()
if args.excludeTags and args.includeTags:
print "Pick either --excludeTags or --includeTags, not both."
return 1
sc = SparkContext()
global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
goodJsonRecords = sc.accumulator(0)
badJsonRecords = sc.accumulator(0)
excludedTagCount = sc.accumulator(0)
includedTagCount = sc.accumulator(0)
tokenCount = sc.accumulator(0)
if args.inputTuples:
data = sc.textFile(args.input).map(lambda x: eval(x))
else:
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
sc.stop()
# So far, this code isn't useful. The output fiile is written by the
# master node into an isolated folder, and I don't know of a way to
# retrieve it.
if args.output != None:
with codecs.open(args.output, "wb", "utf-8") as f:
for k in sorted(tagPhraseCounts):
f.write(k + " " + str(tagPhraseCounts[k]) + "\n")
print "========================================"
print "goodJsonRecords = %d" % goodJsonRecords.value
print "badJsonRecords = %d" % badJsonRecords.value
print "excludedTagCount = %d" % excludedTagCount.value
print "includedTagCount = %d" % includedTagCount.value
print "tokenCount = %d" % tokenCount.value
if args.printToLog:
for k in sorted(tagPhraseCounts):
print json.dumps(k), tagPhraseCounts[k]
print "========================================"
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
args = parser.parse_args()
sc = SparkContext()
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
recordCount = data.count()
print "========================================"
print recordCount
print "========================================"
sc.stop()
Example 10: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    dataWithGoodJson = data.filter(goodJsonFilter)
    recordCount = dataWithGoodJson.count()
    print "========================================"
    print recordCount
    print "========================================"
    sc.stop()
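
goodJsonFilter is not shown in this snippet. Since the RDD elements are (key, value) pairs of Text strings, a plausible sketch (an assumption on my part, not the original code) keeps only the records whose value parses as JSON:

import json

def goodJsonFilter(pair):
    # Hypothetical reconstruction: keep (key, value) pairs whose value is valid JSON.
    key, value = pair
    try:
        json.loads(value)
        return True
    except ValueError:
        return False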
Example 11: wordcount
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def wordcount(file_name, output="spark-wc-out-wordcount"):
sc = SparkContext("local[8]", "WordCount")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
counts = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
counts.coalesce(1).saveAsTextFile(output)
Example 12: artist_user_matrix
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def artist_user_matrix(file_name, output="artist_user_matrix.out"):
sc = SparkContext("local[8]", "UserArtistMatrix")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
counts = file.flatMap(flat_Map) \
.map(map) \
.reduceByKey(reduce) \
.sortByKey(keyfunc=lambda k: int(k))
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
counts.map(lambda x: x[0] + ' ' + x[1]).coalesce(1).saveAsTextFile(output)
Example 13: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-c", "--count", help="Optionally report a count of records extracted.", required=False, action="store_true"
)
parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
parser.add_argument("-k", "--key", help="Required extraction key.", required=True)
parser.add_argument(
"-s", "--sample", type=int, default=0, help="Optionally print a sample of results.", required=False
)
args = parser.parse_args()
extractionKey = args.key
def extractValues(value):
try:
d = json.loads(value)
if extractionKey in d:
return iter([d[extractionKey]])
else:
return iter([])
except:
return iter([])
sc = SparkContext()
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
extractedValuePairs = data.flatMapValues(extractValues)
if args.count:
recordCount = extractedValuePairs.count()
print "========================================"
print recordCount
print "========================================"
if args.sample > 0:
sampleSet = extractedValuePairs.take(args.sample)
print "========================================"
for record in sampleSet:
print record
print "========================================"
Example 14: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv):
inputSequenceDir = ""
outputSequenceDir = ""
try:
opts, args = getopt.getopt(argv,"i:o:")
except getopt.GetoptError:
sys.exit(2)
for (opt,arg) in opts :
if opt == '-i':
inputSequenceDir = arg
elif opt == '-o' :
outputSequenceDir = arg
sc = SparkContext(appName="Fix XML App")
datarawRDD = sc.sequenceFile(inputSequenceDir)
cleanedRDD = datarawRDD.map(lambda x : trim(x))
outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
conf1= {"mapreduce.output.fileoutputformat.compress": "true",
"mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec",
"mapreduce.output.fileoutputformat.compress.type":"RECORD"}
cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir,outputFormatClassName,"org.apache.hadoop.io.Text","org.apache.hadoop.io.Text",None,None,conf1)
print "OK Bye Bye"
Example 15: filter_and_split
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
# (fragment: the snippet starts inside a method of a custom hashing-TF class
#  that maps every term to its hash index)
        #create hashes and reduce by key
        dict = document_terms.flatMap(lambda terms: [(t, self.indexOf(t)) for t in terms]).reduceByKey(lambda a, b: a)
        return dict

def filter_and_split(text):
    delims = u"\r\n\t.,;:'\"()?!$#-0123456789/*%<>@[]+`~_=&^ "
    translate_table = dict((ord(char), u" ") for char in delims)
    return text.lower().strip().translate(translate_table).split(" ")

# init the spark context
if "sc" not in globals():
    sc = SparkContext(appName="TF-IDF")

# Load documents (one per line).
documents = sc.sequenceFile(docs_dir).map(lambda (fname, content): filter_and_split(content))
documents.cache()

# # keep only the content (replace, lower, split, etc)
# documents = documents.

hashingTF = myHashingTF()
# create the tf vectors
tf = hashingTF.transform(documents)
# create the idf vectors
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# save
tfidf.saveAsTextFile(d_out)
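
myHashingTF is a project-specific class that is not included here. Judging from the calls it receives (transform(documents) and self.indexOf(t)), it likely extends pyspark.mllib.feature.HashingTF, which already provides both methods. A minimal sketch under that assumption; the term_index_map method is a hypothetical name mirroring the fragment at the top of this example:

from pyspark.mllib.feature import HashingTF

class myHashingTF(HashingTF):
    """Hypothetical stand-in: HashingTF already provides indexOf() and
    transform(); the real project may add custom behaviour on top."""

    def term_index_map(self, document_terms):
        # Same pattern as the fragment above: map every term to its hash
        # index and keep one entry per distinct term.
        return document_terms.flatMap(
            lambda terms: [(t, self.indexOf(t)) for t in terms]
        ).reduceByKey(lambda a, b: a)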