本文整理汇总了Python中org.apache.pig.scripting.Pig.compileFromFile方法的典型用法代码示例。如果您正苦于以下问题:Python Pig.compileFromFile方法的具体用法?Python Pig.compileFromFile怎么用?Python Pig.compileFromFile使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.pig.scripting.Pig的用法示例。
在下文中一共展示了Pig.compileFromFile方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: runbidi
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def runbidi(src, fdest):
    """Run bidi.pig repeatedly, feeding each pass's output into the next.

    At most 10 passes are made. Pass ``n`` reads ``src`` and writes to
    ``fdest + 'gm%04d' % n``. After each successful pass, alias ``S`` is
    inspected: if it yields any row, the counts directory is removed and
    another pass starts; if it yields none, that pass's output is moved to
    ``fdest`` and the loop ends.

    Args:
        src: input path for the first pass.
        fdest: final destination; also the prefix of the per-pass outputs.

    Raises:
        RuntimeError: if a pass does not complete successfully.
    """
    P = Pig.compileFromFile('src/main/pig/bidi.pig')
    cnts = 'counts'
    Pig.fs('rmr ' + cnts)
    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        job = P.bind({'src': src, 'dest': dest, 'cnts': cnts}).runSingle()
        if not job.isSuccessful():
            # A bare string cannot be raised on Python/Jython 2.6+; use a
            # real exception so the failure message is actually reported.
            raise RuntimeError('failed')
        # The output of this pass is the input of the next one.
        src = dest
        rows = job.result('S').iterator()
        if rows.hasNext():
            Pig.fs('rmr ' + cnts)
        else:
            Pig.fs('mv ' + dest + ' ' + fdest)
            print('ALL DONE!')
            break
示例2: run
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run(self, params, script_name, script_file, elements=None):
    """Execute pig.

    Compiles ``script_file`` under the name ``script_name``, binds ``params``
    and runs it, then passes the result to ``self.handle_future`` and calls
    ``self.complete``.

    Args:
        params: parameters to bind; when this is a list, ``run()`` is used,
            otherwise ``runSingle()``.
        script_name: name given to the compiled script.
        script_file: path of the Pig script to compile.
        elements: passed through to ``self.handle_future``; defaults to a
            fresh empty list on every call.
    """
    # A list literal as the default would be shared between calls.
    if elements is None:
        elements = []
    pig = Pig.compileFromFile(script_name, script_file)
    bound = pig.bind(params)
    futures = bound.run() if isinstance(params, list) else bound.runSingle()
    self.handle_future(futures, elements)
    self.complete()
示例3: run_script
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_script():
    """Compile the templated Pig script, bind it without parameters and run it once."""
    import os
    from org.apache.pig.scripting import Pig

    # The "#{script_name}" placeholder is kept verbatim in the path.
    compiled_script = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    compiled_script.bind().runSingle()
示例4: main
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def main(argv=None):
    """Run the web-log statistics Pig script for one time frame.

    The profile path and the time frame are read from the ``config_file`` and
    ``timeframe`` environment variables. The profile file is executed to
    populate the ``profile`` dict, the report directory is recreated, and
    ``web_<timeframe>.pig`` is run with the profile bound as parameters. For
    the ``daily`` time frame the log directory is recreated and the logs are
    imported first.

    Args:
        argv: currently unused (see the note below).

    Returns:
        1 when the time frame is not daily/weekly/monthly, otherwise None.
        The process exits with status 1 if any Pig run is unsuccessful.
    """
    #Ideally I want to use arguments, ie 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily'
    #however it just doesn't work, I'm not sure why the code has been applied in my version, and I can get it to
    #work with a test .py that only has two lines, import sys, and print sys.argv. Here is the case
    #https://issues.apache.org/jira/browse/PIG-2548
    #    if argv is None:
    #        argv = sys.argv
    #    if len(argv) != 3:
    #        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #        return 1
    #
    #    profile_file = argv[1]
    #    timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if timeframe not in ('daily', 'weekly', 'monthly'):
        print('The time frame must be either daily, weekly or monthly.')
        return 1

    #Load the config
    # NOTE(review): execfile runs the profile as Python code; config_file
    # must only ever point at a trusted file.
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])

    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()

    # Treat a single stats object as a one-element sequence, so the failure
    # message always comes from the run being checked (the single-result
    # branch used to read a loop variable that was not bound yet).
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        runs = [stats]
    else:
        runs = stats
    for run in runs:
        if not run.isSuccessful():
            print('Error in web log stats, %s' % run.getErrorMessage())
            sys.exit(1)
示例5: import_logs
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def import_logs(profile):
    """Import the log files for a given day, placing each in a log dir.

    Leftover temporary output for every entry in ``profile`` is removed
    first, then web_import.pig is run with ``profile`` bound as parameters.
    The imported files are combined later, when web_load.pig runs.
    The process exits with status 1 if any run is unsuccessful.
    """
    # Clean up any leftover files from the last run.
    for entry in profile:
        Pig.fs('rmr %s/%s' % (entry['TMPDIR'], entry['NAME']))

    outcome = Pig.compileFromFile('web_import.pig').bind(profile).run()

    # Check for load errors: a lone SimplePigStats is wrapped so that both
    # result shapes go through the same loop.
    if isinstance(outcome, org.apache.pig.tools.pigstats.SimplePigStats):
        outcome = [outcome]
    for stats in outcome:
        if stats.isSuccessful():
            continue
        print('Error in web log load, %s' % stats.getErrorMessage())
        sys.exit(1)
示例6: run_script
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_script():
    """Run avg_songs_per_split_counter.pig up to 10 times.

    Each run binds ``ITERATION_NUM`` to the run index and prints every row of
    the ``avg_split_song_count`` alias. As soon as a row's field 1 is at
    least 5, the function stops and no further runs are started.
    """
    import os
    from org.apache.pig.scripting import Pig

    # Compile once; the compiled script is re-bound for every run.
    P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")
    for i in range(10):
        print('Run %s started!' % i)
        ps = P.bind({"ITERATION_NUM": i}).runSingle()
        print('Run %s done!' % i)
        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print(r)
            if int(r.get(1).toString()) >= 5:
                print('Good enough! Quitting time!')
                # A plain break would only leave this row loop and the next
                # run would still start; return ends the whole procedure.
                return
示例7: run
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run(self):
    """Print the current action, then compile, bind and run this object's script.

    Returns:
        The value returned by ``runSingle()`` for the bound script.
    """
    print(project_name + ": " + self.action)
    return Pig.compileFromFile(self.script).bind(self.params).runSingle()
示例8: run_pagerank
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_pagerank(self):
    """
    Calculates pageranks for directed graph of nodes and edges.

    Three main steps:
        1. Preprocessing: Process input data to:
            a) Count the total number of nodes.
            b) Prepare initial pagerank values for all nodes.
        2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                      nodes with edges going into the given node.
        3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.

    Iteration stops when the aggregate rank change drops below the scaled
    convergence threshold or after ``self.max_num_iterations`` passes. If
    ``self.max_num_iterations`` is 0, no iteration job runs and the
    postprocessing step reads the preprocessed pageranks.
    """
    # Preprocessing step:
    print("Starting preprocessing step.")
    preprocess = Pig.compileFromFile(self.preprocessing_script)
    preprocess_params = {
        "INPUT_PATH": self.edges_input,
        "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
        "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
    }
    preprocess_bound = preprocess.bind(preprocess_params)
    preprocess_stats = preprocess_bound.runSingle()

    # Update convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes)
    print("Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold))

    # Iteration step:
    iteration = Pig.compileFromFile(self.iteration_script)
    # Start from the preprocessed pageranks so the postprocessing input is
    # defined even when the loop body never runs.
    iteration_pagerank_result = self.preprocess_pageranks
    for i in range(self.max_num_iterations):
        print("Starting iteration step: %s" % str(i + 1))

        # Append the iteration number to the input/output stems
        iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i-1))
        iteration_pageranks_output = self.iteration_pageranks_prefix + str(i)
        iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": self.damping_factor,
            "NUM_NODES": num_nodes,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
        })
        iteration_stats = iteration_bound.runSingle()
        # The latest completed iteration is what postprocessing will read.
        iteration_pagerank_result = iteration_pageranks_output

        # If we're below the convergence threshold break out of the loop.
        aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0)))
        if aggregate_rank_change < convergence_threshold:
            print("Sum of ordering-rank changes %d under convergence threshold %d. Stopping."
                  % (aggregate_rank_change, convergence_threshold))
            break
        elif i == self.max_num_iterations-1:
            print(("Sum of ordering-rank changes %d " % aggregate_rank_change) +
                  ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) +
                  "Stopping.")
        else:
            print("Sum of ordering-rank changes %d above convergence threshold %d. Continuing."
                  % (aggregate_rank_change, convergence_threshold))

    # Postprocesing step:
    print("Starting postprocessing step.")
    postprocess = Pig.compileFromFile(self.postprocessing_script)
    postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result }
    if self.output_path is not None:    # otherwise, the script outputs to the default location,
                                        # which is a special directory in s3://mortar-example-output-data
                                        # permissioned for your Mortar account.
        postprocess_params["OUTPUT_PATH"] = self.output_path
    postprocess_bound = postprocess.bind(postprocess_params)
    postprocess_stats = postprocess_bound.runSingle()
示例9: str
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
import sys
from org.apache.pig.scripting import Pig
# Compile the three scripts up front: the loader comes from the command line.
load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')

print('*** Loading input ***')
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    # Bare strings cannot be raised on Python/Jython 2.6+.
    raise RuntimeError('Load failed')

i = 1
stable_inerations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''
while True:
    print("*** Iteration " + str(i) + " ***")
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({'EDGES_IN': edges_in, 'EDGES_OUT': edges_out,
                                      'CONVERGENCE_OUT': 'convergence.tmp'})
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise RuntimeError('Iteration failed')

    # The first row of the 'convergence' alias carries three integer fields.
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))

    # Drop the temporary convergence output and this iteration's input.
    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
    # NOTE(review): the excerpt ends here; the code that advances i/edges_in
    # and leaves the loop is not visible in this listing.
示例10: runbidi
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
#!/usr/bin/python
import sys
from org.apache.pig.scripting import Pig
from bidipig import runbidi
# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')
osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80

bound = minhash.bind()
job = bound.runSingle()
if not job.isSuccessful():
    # Bare strings cannot be raised on Python/Jython 2.6+.
    raise RuntimeError('failed in minhash')
# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')
src = osrc
keys = dest
示例11: str
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
# NOTE(review): this excerpt starts inside a block whose opening lines are not
# shown, stops before the link-graph job is launched, and has lost its
# indentation; the nesting of the statements below is unverified.
print 'LOG: Elapsed %f' % (endTime - startTime)
# Remove the guardFile
fs.delete( guardFile, True )
System.exit(0)
if fs.exists( parsedDir ):
# parsed-captures
# Rebuild when the output is missing or parsedDir was modified more recently
# than the existing output.
if ( not fs.exists( parsedCaptures) or
fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( parsedCaptures ).getModificationTime() ):
print 'LOG: Graph parsed-captures create'
# Delete the old output (second argument True) before the job runs.
fs.delete( parsedCaptures, True )
params = { 'INPUT' : str(parsedDir),
'OUTPUT' : str(parsedCaptures),
'JOBNAME': str(collection) + ' parsed-captures' }
job = Pig.compileFromFile( 'pig/parsed-captures.pig' ).bind( params )
result = job.runSingle(props)
if not result.isSuccessful():
print '\nERROR: Pig job parsed-captures for ' + collection
System.exit(1)
else:
print 'LOG: Graph parsed-captures up-to-date'
# link-graph
# Same freshness test as above, applied to the link-graph output.
if ( not fs.exists( linkGraph ) or
fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( linkGraph ).getModificationTime() ):
print 'LOG: Graph link-graph create'
fs.delete( linkGraph, True )
params = { 'INPUT' : str(parsedDir),
'OUTPUT' : str(linkGraph),
'JOBNAME': str(collection) + ' link-graph' }
示例12: run_pagerank
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"
                 ):
    """
    Calculates pageranks for directed graph of nodes and edges.

    Three main steps:
        1. Preprocessing: Process input data to:
            a) Count the total number of nodes.
            b) Prepare initial pagerank values for all nodes.
        2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                      nodes with edges going into the given node.
        3. Postprocessing: Order nodes by pagerank
                           Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                           to get human-readable names

    Args:
        edges_input: path bound to INPUT_PATH of the preprocessing script.
        output_path: path the ordered result is stored to.
        tmp_output_dir: directory under which the preprocess/ and iteration/
            intermediate outputs are written.
        damping_factor: bound to DAMPING_FACTOR of the iteration script.
        convergence_threshold: multiplied by num_nodes squared and truncated
            to an integer before being used as the stopping threshold.
        max_num_iterations: upper bound on the number of iterations.
        id_name_map: optional path of (id, name) pairs; when given, the
            result contains names instead of ids.
        preprocessing_script: Pig script run for the preprocessing step.
        iteration_script: Pig script run for each iteration.
    """
    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print("Starting preprocessing step.")
    # Use the script the caller passed in; the hard-coded path is its default.
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
        "INPUT_PATH"            : edges_input,
        "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print("Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold))

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        # The first iteration reads the preprocessing output; later ones
        # read the pageranks written by the previous iteration.
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        iteration_script,                 # the pigscript to iterate
        iteration_dir,                    # temporary iteration outputs will be stored here
        iteration_param_func,             # takes iteration #, returns Pig parameter dictionary
        "Sum of ordering-rank changes",   # name of the convergence metric
        int,                              # Python type of the convergence metric
        "aggregate_rank_change",          # alias in the pigscript where the metric is stored to
        convergence_threshold,            # stop when metric less than this
        max_num_iterations                # or if this many iterations have been performed
    )

    # Postprocesing step:
    print("Starting postprocessing step.")

    postprocess_script = (
        "\n"
        "pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);\n"
        "pageranks = FILTER pageranks BY pagerank IS NOT NULL;\n"
    )
    # Parameters shared by both variants of the postprocessing script.
    postprocess_params = {
        "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
        "OUTPUT_PATH"          : output_path
    }

    if id_name_map:
        postprocess_script += (
            "\n"
            "id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);\n"
            "with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;\n"
            "ordered = ORDER with_names BY pagerank DESC;\n"
            "rmf $OUTPUT_PATH;\n"
            "STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();\n"
        )
        postprocess_params["ID_NAME_MAP_INPUT_PATH"] = id_name_map
    else:
        postprocess_script += (
            "\n"
            "ordered = ORDER pageranks BY pagerank DESC;\n"
            "rmf $OUTPUT_PATH;\n"
            "STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();\n"
        )

    postprocess = Pig.compile(postprocess_script).bind(postprocess_params).runSingle()
#.........这里部分代码省略.........
示例13:
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
#Passing PIG Script to PYTHON and RUN
#! /usr/bin/python
from org.apache.pig.scripting import Pig
P = Pig.compileFromFile("""myscript.pig""")
input = "original"
output = "output"
# The compiled script is bound to ``P``; a lowercase ``p`` is never defined.
result = P.bind({'in':input, 'out':output}).runSingle()
if result.isSuccessful():
    print("Pig job succeeded")
else:
    # Bare strings cannot be raised on Python/Jython 2.6+.
    raise RuntimeError("Pig job failed")
示例14: run_script
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_script():
# Runs the clustering preprocess script once, then the iterate script up to
# max_num_iterations times, feeding each iteration's output into the next.
# NOTE(review): the indentation of this listing was lost and the function
# continues past the visible excerpt (postprocessing is not shown).
import os
from org.apache.pig.scripting import Pig
# Specify where the data will come from,
# and where output data will go after each step
data_stem = "s3n://jpacker-dev/amazon_products/books_graph/"
num_vertices_input = data_stem + "num_vertices"
nodes_input = data_stem + "nodes"
edges_input = data_stem + "edges"
output_stem = data_stem + "clustering/"
preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
postprocess_clusters_output = output_stem + "postprocess/clusters"
postprocess_stats_output = output_stem + "postprocess/stats"
# The string literal below is never assigned or used: it holds an alternative
# set of paths (local fixtures) that is effectively commented out.
"""
data_stem = "../fake-fixtures/"
num_vertices_input = data_stem + "cathedral-num-vertices"
nodes_input = data_stem + "cathedral-nodes"
edges_input = data_stem + "cathedral-edges"
output_stem = data_stem + "cathedral_clustering/"
preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
postprocess_clusters_output = output_stem + "postprocess/clusters"
postprocess_stats_output = output_stem + "postprocess/stats"
"""
# Preprocessing step:
#
# (1) Generate a transition matrix from the internal edges
# (2) Copy precomputed count of # vertices
# No computation is being done here; this just lets us use Pig to access the data
# instead of configuring S3 access manually with boto
#
preprocess = Pig.compileFromFile("../pigscripts/clustering_preprocess.pig")
preprocess_bound = preprocess.bind({
"NUM_VERTICES_INPUT_PATH": num_vertices_input,
"EDGES_INPUT_PATH": edges_input,
"NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
"TRANS_MAT_OUTPUT_PATH": preprocess_trans_mat_output
})
preprocess_stats = preprocess_bound.runSingle()
# Extract the number of vertices, which we will pass into each iteration as a parameter
num_vertices = long(str(preprocess_stats.result("num_verts").iterator().next().get(0)))
# Extract the number of edges (including inserted self-loops)
# We will use this in our convergence metric
initial_num_edges = long(str(preprocess_stats.getNumberRecords(preprocess_trans_mat_output)))
# Iteration step applying the Markov Clustering operations:
#
# (1) Expansion: square the transition matrix ~= take a step in a random walk
# (2) Inflation: take an elementwise power of the matrix ~= strengthen strong connections, weaken weak ones'
# (3) Pruning: set small matrix values to zero (since the matrix impl is sparse, this greatly speeds things up)
# (4) Normalization: renormalize the matrix columnwise to keep it a valid transition matrix
#
# I tested several mathematically sensible convergence metrics
# (max of max residual for each col, avg of max residual for each col, col kurtosis)
# but none worked very well. So I'm currently just breaking when the number of edges
# in an iteration's transition matrix is less than the number of edges in
# the initial transition matrix times a constant multiple, which seems to indicate
# that things are settling down.
#
# The algorithm has two parameters:
# (1) The inflation parameter is an exponential factor which determines the cluster size. higher inflation => smaller clusters
# (2) Epsilon is a minimum threshold for values in the transition matrix; anything smaller will be pruned (set to zero)
# I am not sure how high epsilon can safely be set without significantly degrading the quality of the algorithm
# If you run in to performance problems though, raising epsilon will dramatically reduce execution time
#
iteration = Pig.compileFromFile("../pigscripts/clustering_iterate.pig")
max_num_iterations = 7 # most graphs should converge after 4-10 iterations
num_iterations = 0
for i in range(1, max_num_iterations + 1):
# Iteration 1 reads the preprocessing output; later ones read the previous
# iteration's transition matrix.
iteration_input = preprocess_trans_mat_output if i == 1 else (iteration_trans_mat_output_stem + str(i-1))
iteration_output = iteration_trans_mat_output_stem + str(i)
iteration_bound = iteration.bind({
"INPUT_PATH": iteration_input,
"ITERATION_OUTPUT_PATH": iteration_output,
"NUM_VERTICES": num_vertices,
"INFLATION_PARAMETER": 1.5,
"EPSILON": 0.01
})
iteration_stats = iteration_bound.runSingle()
num_iterations += 1
num_edges = long(str(iteration_stats.getNumberRecords(iteration_output)))
# Stop after at least 3 iterations once the edge count falls below 1.05x the
# initial edge count.
if num_iterations >= 3 and num_edges < (initial_num_edges * 1.05):
break
# Postprocessing step:
#
#.........这里部分代码省略.........
示例15: run_pagerank
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import compileFromFile [as 别名]
def run_pagerank():
    """
    Calculates pageranks for Twitter users.

    Three main steps:
        1. Preprocessing: Process input data to:
            a) Count the total number of users.
            b) Prepare initial pagerank values for all users.
        2. Iterative: Calculate new pageranks for each user based on the previous pageranks of the
                      users' followers.
        3. Postprocesing: Find the top pagerank users and join to a separate dataset to find their names.

    Iteration stops when the reported max diff drops below
    CONVERGENCE_THRESHOLD / num_users, or after MAX_NUM_ITERATIONS passes.
    If MAX_NUM_ITERATIONS is 0, postprocessing reads the preprocessed
    pageranks.
    """
    # Preprocessing step:
    print("Starting preprocessing step.")
    preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT)
    preprocess_bound = preprocess.bind({
        "INPUT_PATH": FOLLOWER_GRAPH_INPUT,
        "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS,
        "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0)))
    # Scale the configured threshold by the number of users. float() keeps
    # Python 2 from truncating the quotient when the constant is an integer.
    convergence_threshold = float(CONVERGENCE_THRESHOLD) / num_users

    # Iteration step:
    iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT)
    # Defined up front so postprocessing has an input even if no iteration runs.
    iteration_pagerank_result = PREPROCESS_PAGERANKS
    for i in range(MAX_NUM_ITERATIONS):
        print("Starting iteration step: %s" % str(i + 1))

        # Append the iteration number to the input/output stems
        iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i-1))
        iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i)
        iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": DAMPING_FACTOR,
            "NUM_USERS": num_users,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output
        })
        iteration_stats = iteration_bound.runSingle()
        # The latest completed iteration is what postprocessing will read.
        iteration_pagerank_result = iteration_pageranks_output

        # If we're below the convergence_threshold break out of the loop.
        # Compare against the scaled value computed above; it used to be
        # computed and then ignored in favour of the raw constant.
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            print("Max diff %s under convergence threshold. Stopping." % max_diff)
            break
        elif i == MAX_NUM_ITERATIONS-1:
            print("Max diff %s above convergence threshold but hit max number of iterations.  Stopping."
                  % max_diff)
        else:
            print("Max diff %s above convergence threshold. Continuing." % max_diff)

    # Postprocesing step:
    print("Starting postprocessing step.")
    postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT)
    postprocess_bound = postprocess.bind({
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result,
        "USERNAMES_INPUT_PATH": USERNAMES_INPUT,
        "TOP_N": NUM_TOP_USERS,
        "OUTPUT_BUCKET": OUTPUT_BUCKET
    })
    postprocess_stats = postprocess_bound.runSingle()