This page collects typical code examples of the Python method pyspark.SparkContext.sequenceFile. If you have been wondering what exactly SparkContext.sequenceFile does, how to call it, or what real-world code that uses it looks like, the hand-picked examples below should help. You can also look further into usage examples for the containing class, pyspark.SparkContext.
The 15 code examples of SparkContext.sequenceFile shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
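
Before the individual examples, here is a minimal, self-contained sketch of the call itself. The master URL, app name, input path, and key/value classes are placeholders chosen for illustration, not values taken from any example below:

from pyspark import SparkContext

sc = SparkContext("local[2]", "SequenceFileDemo")

# Read a Hadoop SequenceFile of Text keys and Text values into an RDD of
# (key, value) string pairs.  Path and writable classes are placeholders.
pairs = sc.sequenceFile("hdfs:///tmp/example.seq",
                        "org.apache.hadoop.io.Text",
                        "org.apache.hadoop.io.Text")

print(pairs.take(5))
sc.stop()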
Example 1: recom
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def recom(matrix_file_name, user_file_name, output="re.out"):
sc = SparkContext("local[8]", "Recommendation")
""" Reads in a sequence file FILE_NAME to be manipulated """
matrix = sc.sequenceFile(matrix_file_name)
user = sc.sequenceFile(user_file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
user_tuples = user.flatMap(flat_user) \
.map(map_user) \
.sortByKey(keyfunc=lambda k: int(k))
keys = user_tuples.keys().collect()
matrix_tuples = matrix.flatMap(flat_matrix) \
.map(map_matrix) \
.filter(lambda x: x[0] in keys)
global mt
mt = matrix_tuples.collectAsMap()
recm = user_tuples.flatMap(flat_recom) \
.reduceByKey(reduce_recom) \
.filter(lambda x: x[0] not in keys) \
.sortBy(lambda (key, value): int(value))
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
recm.coalesce(1).saveAsTextFile(output)
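
Note that this example collects the filtered matrix to the driver with collectAsMap and exposes it through the global mt so that flat_recom (defined elsewhere in the original project) can look rows up by key. A hedged alternative sketch, keeping the same lookup idea but shipping the table through a broadcast variable instead of a module-level global; flat_recom_bc is a hypothetical variant, not part of the original code:

# Sketch only: broadcast the collected matrix instead of relying on a global.
mt_broadcast = sc.broadcast(matrix_tuples.collectAsMap())

def flat_recom_bc(user_row):
    mt = mt_broadcast.value        # local lookup table on each executor
    # ... same recommendation logic as flat_recom, using mt ...
    return []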
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    noPublisherRecords = sc.accumulator(0)
    noPublisherNameRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    keyCounts = data.values().flatMap(getKeys).countByValue()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "noPublisherRecords = %d" % noPublisherRecords.value
    print "noPublisherNameRecords = %d" % noPublisherNameRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
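
The getKeys helper is not included in this snippet. Based only on the accumulator names and the flatMap/countByValue usage above, a plausible reconstruction (an assumption, not the original implementation) parses each value as JSON, tracks parse failures and missing publisher fields, and emits the record's top-level field names:

import json

def getKeys(value):
    # Hypothetical reconstruction of getKeys; only the accumulator names
    # come from the example above.
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    try:
        record = json.loads(value)
    except ValueError:
        badJsonRecords += 1
        return iter([])
    goodJsonRecords += 1
    publisher = record.get("publisher")
    if publisher is None:
        noPublisherRecords += 1
    elif "name" not in publisher:
        noPublisherNameRecords += 1
    # Emit the top-level field names so countByValue() yields a histogram
    # of which keys appear across records.
    return iter(record.keys())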
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
Example 4: user_artist_matrix
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def user_artist_matrix(file_name, output="user_artist_matrix.out"):
sc = SparkContext("local[8]", "UserArtistMatrix")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
ua_matrix = file.flatMap(ua_flat_doc) \
.map(ua_map) \
.reduceByKey(ua_reduce) \
.sortByKey(keyfunc=lambda k: int(k))
ua_matrix = ua_matrix.flatMap(ua_flat_vec)
global avg_matrix
avg_matrix = ua_matrix.reduceByKey(ua_reduce_vec) \
.map(ua_map_avg)
avg_matrix = avg_matrix.collectAsMap()
co_matrix = ua_matrix.map(ua_map_cmp) \
.reduceByKey(ua_reduce_cmp) \
.map(ua_map_cmp_final)
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
co_matrix.coalesce(1).saveAsTextFile(output)
Example 5: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                 token.count, token.cumulativeCount, fractionOfTotal))
        print "========================================"
Example 6: docwordcount
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
sc = SparkContext("local[8]", "DocWordCount")
file = sc.sequenceFile(file_name)
counts = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
counts.coalesce(1).saveAsTextFile(output)
Example 7: index
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def index(file_name, output="spark-wc-out-index"):
sc = SparkContext("local[8]", "Index")
file = sc.sequenceFile(file_name)
indices = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
indices.coalesce(1).saveAsTextFile(output)
Example 8: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
args = parser.parse_args()
if args.excludeTags and args.includeTags:
print "Pick either --excludeTags or --includeTags, not both."
return 1
sc = SparkContext()
global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
goodJsonRecords = sc.accumulator(0)
badJsonRecords = sc.accumulator(0)
excludedTagCount = sc.accumulator(0)
includedTagCount = sc.accumulator(0)
tokenCount = sc.accumulator(0)
if args.inputTuples:
data = sc.textFile(args.input).map(lambda x: eval(x))
else:
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
sc.stop()
# So far, this code isn't useful. The output fiile is written by the
# master node into an isolated folder, and I don't know of a way to
# retrieve it.
if args.output != None:
with codecs.open(args.output, "wb", "utf-8") as f:
for k in sorted(tagPhraseCounts):
f.write(k + " " + str(tagPhraseCounts[k]) + "\n")
print "========================================"
print "goodJsonRecords = %d" % goodJsonRecords.value
print "badJsonRecords = %d" % badJsonRecords.value
print "excludedTagCount = %d" % excludedTagCount.value
print "includedTagCount = %d" % includedTagCount.value
print "tokenCount = %d" % tokenCount.value
if args.printToLog:
for k in sorted(tagPhraseCounts):
print json.dumps(k), tagPhraseCounts[k]
print "========================================"
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
args = parser.parse_args()
sc = SparkContext()
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
recordCount = data.count()
print "========================================"
print recordCount
print "========================================"
sc.stop()
Example 10: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    dataWithGoodJson = data.filter(goodJsonFilter)
    recordCount = dataWithGoodJson.count()
    print "========================================"
    print recordCount
    print "========================================"
    sc.stop()
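
goodJsonFilter is not shown in this snippet. Since the RDD elements are (key, value) pairs of Text strings, a plausible sketch (an assumption on my part, not the original code) keeps only the records whose value parses as JSON:

import json

def goodJsonFilter(pair):
    # Hypothetical reconstruction: keep (key, value) pairs whose value is valid JSON.
    key, value = pair
    try:
        json.loads(value)
        return True
    except ValueError:
        return False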
Example 11: wordcount
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def wordcount(file_name, output="spark-wc-out-wordcount"):
sc = SparkContext("local[8]", "WordCount")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
counts = file.flatMap(flat_map) \
.map(map) \
.reduceByKey(reduce)
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
counts.coalesce(1).saveAsTextFile(output)
Example 12: artist_user_matrix
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def artist_user_matrix(file_name, output="artist_user_matrix.out"):
sc = SparkContext("local[8]", "UserArtistMatrix")
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- flatMap takes in a function that will take one input and outputs 0 or more
items
- map takes in a function that will take one input and outputs a single item
- reduceByKey takes in a function, groups the dataset by keys and aggregates
the values of each key
"""
counts = file.flatMap(flat_Map) \
.map(map) \
.reduceByKey(reduce) \
.sortByKey(keyfunc=lambda k: int(k))
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
counts.map(lambda x: x[0] + ' ' + x[1]).coalesce(1).saveAsTextFile(output)
Example 13: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv=None):
"""this is called if run from command line"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-c", "--count", help="Optionally report a count of records extracted.", required=False, action="store_true"
)
parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
parser.add_argument("-k", "--key", help="Required extraction key.", required=True)
parser.add_argument(
"-s", "--sample", type=int, default=0, help="Optionally print a sample of results.", required=False
)
args = parser.parse_args()
extractionKey = args.key
def extractValues(value):
try:
d = json.loads(value)
if extractionKey in d:
return iter([d[extractionKey]])
else:
return iter([])
except:
return iter([])
sc = SparkContext()
data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
extractedValuePairs = data.flatMapValues(extractValues)
if args.count:
recordCount = extractedValuePairs.count()
print "========================================"
print recordCount
print "========================================"
if args.sample > 0:
sampleSet = extractedValuePairs.take(args.sample)
print "========================================"
for record in sampleSet:
print record
print "========================================"
Example 14: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
def main(argv):
inputSequenceDir = ""
outputSequenceDir = ""
try:
opts, args = getopt.getopt(argv,"i:o:")
except getopt.GetoptError:
sys.exit(2)
for (opt,arg) in opts :
if opt == '-i':
inputSequenceDir = arg
elif opt == '-o' :
outputSequenceDir = arg
sc = SparkContext(appName="Fix XML App")
datarawRDD = sc.sequenceFile(inputSequenceDir)
cleanedRDD = datarawRDD.map(lambda x : trim(x))
outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
conf1= {"mapreduce.output.fileoutputformat.compress": "true",
"mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec",
"mapreduce.output.fileoutputformat.compress.type":"RECORD"}
cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir,outputFormatClassName,"org.apache.hadoop.io.Text","org.apache.hadoop.io.Text",None,None,conf1)
print "OK Bye Bye"
Example 15: filter_and_split
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import sequenceFile [as alias]
# (fragment: the snippet starts inside a method of a custom hashing-TF class
#  that maps every term to its hash index)
        #create hashes and reduce by key
        dict = document_terms.flatMap(lambda terms: [(t, self.indexOf(t)) for t in terms]).reduceByKey(lambda a, b: a)
        return dict

def filter_and_split(text):
    delims = u"\r\n\t.,;:'\"()?!$#-0123456789/*%<>@[]+`~_=&^ "
    translate_table = dict((ord(char), u" ") for char in delims)
    return text.lower().strip().translate(translate_table).split(" ")

# init the spark context
if "sc" not in globals():
    sc = SparkContext(appName="TF-IDF")

# Load documents (one per line).
documents = sc.sequenceFile(docs_dir).map(lambda (fname, content): filter_and_split(content))
documents.cache()

# # keep only the content (replace, lower, split, etc)
# documents = documents.

hashingTF = myHashingTF()
# create the tf vectors
tf = hashingTF.transform(documents)
# create the idf vectors
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# save
tfidf.saveAsTextFile(d_out)
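
myHashingTF is a project-specific class that is not included here. Judging from the calls it receives (transform(documents) and self.indexOf(t)), it likely extends pyspark.mllib.feature.HashingTF, which already provides both methods. A minimal sketch under that assumption; the term_index_map method is a hypothetical name mirroring the fragment at the top of this example:

from pyspark.mllib.feature import HashingTF

class myHashingTF(HashingTF):
    """Hypothetical stand-in: HashingTF already provides indexOf() and
    transform(); the real project may add custom behaviour on top."""

    def term_index_map(self, document_terms):
        # Same pattern as the fragment above: map every term to its hash
        # index and keep one entry per distinct term.
        return document_terms.flatMap(
            lambda terms: [(t, self.indexOf(t)) for t in terms]
        ).reduceByKey(lambda a, b: a)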