This article collects typical usage examples of the Python method pyspark.SparkContext.accumulator. If you are wondering what SparkContext.accumulator does or how to use it, the curated code examples below may help. You can also explore further usage examples of its enclosing class, pyspark.SparkContext.
Fifteen code examples of SparkContext.accumulator are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
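Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: create an accumulator on the driver, add to it from tasks running on the executors, and read .value back on the driver once an action has run. The application name, sample data, and variable names below are made up for illustration.

from pyspark import SparkContext

sc = SparkContext("local[2]", "AccumulatorSketch")

# Driver-side accumulator; tasks may only add to it, the driver reads it.
empty_lines = sc.accumulator(0)

def count_empty(line):
    # Runs on the executors; Accumulator.add() is the only write operation allowed there.
    if len(line.strip()) == 0:
        empty_lines.add(1)

lines = sc.parallelize(["spark", "", "accumulator", "", "example"])
lines.foreach(count_empty)  # foreach is an action, so the add() calls actually execute

print("empty_lines = %d" % empty_lines.value)  # read back on the driver
sc.stop()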
Example 1: SearchTiles_and_Factorize
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon
    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")
    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        #print "tileintervalslist=",tileintervalslist
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        #spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print "factors_accum.value = ", factors_accum.value
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
Developer ID: shrinivaasanka, Project: asfer-github-code, Lines of code: 36, Source file: DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.py
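Example 1 relies on custom accumulator parameter classes (VectorAccumulatorParam, FactorsAccumulatorParam) defined elsewhere in that project and not shown here. For orientation, a minimal sketch of a list-valued AccumulatorParam subclass follows; the class name ListAccumulatorParam and its merge behavior are assumptions for illustration, not the project's actual implementation (Example 7 below shows a comparable real one from another project).

from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam

class ListAccumulatorParam(AccumulatorParam):
    """Accumulator whose value is a list; partial lists from tasks are concatenated."""
    def zero(self, initial_value):
        # Identity element for merging: an empty list.
        return []

    def addInPlace(self, v1, v2):
        # Merge two partial results; a non-list argument is appended as a single element.
        if isinstance(v2, list):
            v1.extend(v2)
        else:
            v1.append(v2)
        return v1

sc = SparkContext("local[2]", "ListAccumulatorSketch")
acc = sc.accumulator([], ListAccumulatorParam())
sc.parallelize(range(10)).foreach(lambda x: acc.add([x * x]))
print(sorted(acc.value))  # squares of 0..9; arrival order is not guaranteed, hence the sort
sc.stop()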
Example 2: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    noPublisherRecords = sc.accumulator(0)
    noPublisherNameRecords = sc.accumulator(0)

    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    keyCounts = data.values().flatMap(getKeys).countByValue()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "noPublisherRecords = %d" % noPublisherRecords.value
    print "noPublisherNameRecords = %d" % noPublisherNameRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
Example 3: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)

    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful. The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
Example 4: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)

    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                 token.count, token.cumulativeCount, fractionOfTotal))
        print "========================================"
Example 5: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def main(argv=None):
    """this is called if run from command line"""
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
    parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
    parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
    parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
    parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print "Pick either --excludeTags or --includeTags, not both."
        return 1

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    excludedTagCount = sc.accumulator(0)
    includedTagCount = sc.accumulator(0)
    tokenCount = sc.accumulator(0)

    if args.inputTuples:
        data = sc.textFile(args.input).map(lambda x: eval(x))
    else:
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
    sc.stop()

    # So far, this code isn't useful. The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, "wb", "utf-8") as f:
            for k in sorted(tagPhraseCounts):
                f.write(k + " " + str(tagPhraseCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "excludedTagCount = %d" % excludedTagCount.value
    print "includedTagCount = %d" % includedTagCount.value
    print "tokenCount = %d" % tokenCount.value
    if args.printToLog:
        for k in sorted(tagPhraseCounts):
            print json.dumps(k), tagPhraseCounts[k]
    print "========================================"
Example 6: SparkBroadcastAccumulator
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
Example 7: longest_common_substring
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def longest_common_substring(strands):
    pass
    # create the Spark context
    conf = SparkConf().setAppName("longest_common_substring")
    sc = SparkContext(conf=conf)

    # create an accumulator for key-value pairs, where each key is a substring,
    # and each value is the set of strings where the substring can be found
    class ArrayAccumulatorParam(AccumulatorParam):
        def zero(self, initialValue):
            return initialValue

        def addInPlace(self, v1, v2):
            if type(v2) is list:
                v1.extend(v2)
            elif type(v2) is tuple:
                v1.append(v2)
            return v1

    acc = sc.accumulator([], ArrayAccumulatorParam())

    def generate_substrings(data_element):
        k, v = data_element
        i = 0
        while i < len(v):
            j = i + 1
            while j < len(v):
                acc.add((v[i:j], k))
                j += 1
            i += 1

    sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings)
    all_substrings = sc.parallelize(acc.value)

    return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
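A hypothetical invocation, for orientation only; the strand names and DNA strings below are assumptions and not part of the original project. The function returns a longest substring shared by every value in the dict (ties are broken arbitrarily by takeOrdered):

strands = {"Rosalind_1": "GATTACA", "Rosalind_2": "TAGACCA", "Rosalind_3": "ATACA"}
print(longest_common_substring(strands))  # prints a length-2 common substring, e.g. "TA" or "AC"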
Example 8: main
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)

    data = sc.textFile(args.input).map(lambda x: eval(x))
    keyCounts = data.values().flatMap(getKeys).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"
Example 9: word_count_compute
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
def word_count_compute(hdfs_input, hdfs_output, min_n, max_n):
    #TODO add start of sent
    # TODO large max size
    sums = []
    vocab_size = []
    sc = SparkContext("local", "Simple Language Model Computing")
    file = sc.textFile(hdfs_input)
    counts = file.flatMap(lambda a: get_ngrams(a, min_n, max_n)).reduceByKey(lambda a, b: a + b)
    for i in range(min_n, max_n + 1):
        temp = counts.filter(lambda a: is_ngram(a, i))
        accum = sc.accumulator(0)
        temp.foreach(lambda a: accum.add(a[1]))
        #temp_sum = sum(x[1] for x in temp.collect())
        sums.append(accum.value)
        temp_counts = temp.count()
        vocab_size.append(temp_counts)
        #print i,temp_counts,temp_sum
    print sums, vocab_size
    return counts, sums, vocab_size
Example 10: id
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
#input data format:
# id0 id01 id02 ...
# id1 id11 id12 ...
#...
# first id (idn) is current node
# following ids (idnm) are nodes connected to idn.
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("BFS")
sc = SparkContext(conf = conf)
hitCounter = sc.accumulator(0)
src_id = 1
dst_id = 6
def parseInput(line):
    l = line.split()
    v_id = int(l[0])
    if v_id == src_id:
        v_dist = 0
        v_status = 1
    else:
        v_dist = 9999
        v_status = 0
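The excerpt ends before parseInput finishes and before the driver loop, so the role of hitCounter is not visible here. Purely as an illustration of the common pattern (not the original author's code), the sketch below runs a tiny BFS on a made-up adjacency list and uses an accumulator as a hit flag that the driver checks after each iteration's action. The graph, helper names, and status encoding are all assumptions.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("BFSSketch")
sc = SparkContext(conf=conf)

hitCounter = sc.accumulator(0)
src_id, dst_id = 1, 6

# node -> neighbors; node state is (neighbors, distance, status): 0 = unvisited, 1 = frontier, 2 = done
graph = {1: [2, 3], 2: [4], 3: [5], 4: [6], 5: [], 6: []}
nodes = [(n, (nbrs, 0 if n == src_id else 9999, 1 if n == src_id else 0))
         for n, nbrs in graph.items()]
rdd = sc.parallelize(nodes)

def bfsMap(node):
    n, (nbrs, dist, status) = node
    results = []
    if status == 1:                    # expand frontier nodes
        for nbr in nbrs:
            if nbr == dst_id:
                hitCounter.add(1)      # signal the driver that the target was reached
            results.append((nbr, ([], dist + 1, 1)))
        status = 2
    results.append((n, (nbrs, dist, status)))
    return results

def bfsReduce(a, b):
    # keep the known edges, the smaller distance, and the most advanced status
    return (a[0] or b[0], min(a[1], b[1]), max(a[2], b[2]))

for iteration in range(10):
    mapped = rdd.flatMap(bfsMap)
    mapped.count()                     # an action, so the accumulator updates are applied
    if hitCounter.value > 0:
        print("Reached node %d after %d iteration(s)." % (dst_id, iteration + 1))
        break
    rdd = mapped.reduceByKey(bfsReduce)

sc.stop()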
Example 11: str
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
    f = inputFiles[i]
    if subFileCount == filePartitionSize:
        subDirs.append(subDir)
        subDirNum += 1
        subFileCount = 0
        subDir = str(subDirNum) + "/"
        os.makedirs(input_dir + subDir)
    shutil.move(input_dir + f, input_dir + subDir)
    subFileCount += 1

if subFileCount == filePartitionSize:
    subDirs.append(subDir)

sc = SparkContext("local[" + numCores + "]", "job", pyFiles=[realpath('helper.py')])

eNodeBLoadVec = []
for bs in eNodeBs:
    v = sc.accumulator([(0, 0, 0)] * len(intervals), VectorAccumulatorParamTriple())
    eNodeBLoadVec.append(v)

prev_idx = 0
for i in range(len(subDirs)):
    d = subDirs[i]
    end_idx = intervals.index(dirTimeBoundaries[i])
    intervalBoundary = (prev_idx + 1, end_idx)  # both indexes are included
    prev_idx = end_idx
    bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(generateBS2Data).reduceByKey(reduceBS2IMSI2Data)
    bs2data.foreach(getBearerLoad)

resetDirectories(subDirs, input_dir)

header = "time "
Example 12: CreateHW
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
# Create W, H, V
W, H = CreateHW()
num_users = W.shape[0]
num_movies = H.shape[1]
V = CreateMatrix(num_users, num_movies)

# Initialize sc
conf = SparkConf().setAppName('DSGD').setMaster('local[%d]' % num_workers)
sc = SparkContext(conf=conf)

# Initialize strata
init_strata = [[i, v] for i, v in enumerate(np.random.permutation(num_workers))]
S = sc.parallelize(init_strata)

# Initialize clock
clock = sc.accumulator(0)

# Iteration
for i in xrange(num_iterations):
    # Get rows, cols from strata
    split = S.map(GetRowCol).collect()
    # Get block from rows, cols
    matrices = []
    for row, col in split:
        V_block = V.tocsr()[row, :].tocsc()[:, col]
        W_block = W[row, :].copy()
        H_block = H[:, col].copy()
        matrices.append((V_block, W_block, H_block))
    # Set clock
    clk = clock.value
    # Calculate gradient
Example 13: len
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
import sys
from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: AverageWordLength <file or directory>"
        exit(-1)

    sc = SparkContext()
    totalWords = sc.accumulator(0)
    totalLetters = sc.accumulator(0.0)
    words = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split())

    def addTotals(word, words, letters):
        words += 1
        letters += len(word)

    words.foreach(lambda word: addTotals(word, totalWords, totalLetters))
    print "Average word length:", totalLetters.value / totalWords.value
Example 14: len
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
        a_wenting += 1
    elif "dongshen" in element:
        a_dongshen += 1
    elif "qifeng" in element:
        a_qifeng += 1
    else:
        a_else += 1
    return element

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: input <file>")
        exit(-1)

    sc = SparkContext(appName="accumulatorTest")

    # create the four accumulators
    a_wenting = sc.accumulator(0)
    a_dongshen = sc.accumulator(0)
    a_qifeng = sc.accumulator(0)
    a_else = sc.accumulator(0)

    lines = sc.textFile(sys.argv[1], 2).map(fun)
    for line in lines.collect():
        print line
    print a_wenting.value
    print a_else.value
    sc.stop()
Example 15: Counter()
# Module to import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import accumulator [as alias]
        return Counter()

    def addInPlace(self, hashdict, items):
        hashdict.update(items)
        return hashdict

if __name__ == '__main__':
    sc = SparkContext(appName="FraMultiStage")
    data_input = 's3://progetto-analisi-di-dati-unimi/dataset'
    data_output = 's3://progetto-analisi-di-dati-unimi/output_multistage/'
    split_by, supp, combsize = ',', 18000, 2

    data = sc.textFile(data_input).map(lambda x: sorted(set(x.split(split_by))))

    # Converting item names to numbers
    item_to_n = sc.accumulator(dict(), DictAccumulatorParam())
    data.foreach(item_to_n.add)
    item_to_n = item_to_n.value
    data = data.map(lambda x: [item_to_n[i] for i in x])

    # Hashmaps and their hash functions
    hashmap1 = sc.accumulator(Counter(), HashMapAccumulator())
    hashmap2 = sc.accumulator(Counter(), HashMapAccumulator())

    # Get frequent items from a bucket
    def getFreq(bucket): return filter(lambda i: i in freq, bucket)

    # Hash functions for each hashmap
    def hashf1(x): return sum(x) % 21243
    def hashf2(x): return sum(x) % 10621