本文整理汇总了Python中org.apache.pig.scripting.Pig.fs方法的典型用法代码示例。如果您正苦于以下问题：Python Pig.fs方法的具体用法？Python Pig.fs怎么用？Python Pig.fs使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.pig.scripting.Pig的用法示例。
在下文中一共展示了Pig.fs方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
def __init__(self, params):
    """Generate and run a Pig script, then optionally send the results to Grapht.

    Args:
        params: mapping of job parameters. After ``set_param_defaults()`` has
            run it must contain ``'output_name'``, which is used as the HDFS
            path to clear, as the ``output`` Pig parameter, and as the metric
            name.

    Raises:
        RuntimeError: if the Pig job does not complete successfully. (The
            original raised a bare string, which is itself a TypeError on
            Python/Jython 2.6+.)
        SystemExit: if the user declines to send the data to Grapht.
    """
    # BIND and RUN
    self.params = params
    self.set_param_defaults()

    # Remove whatever already exists at the output path before running.
    Pig.fs("rmr " + self.params['output_name'])

    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()
    compiled = Pig.compile(full_script)
    results = compiled.bind({
        'output': self.params['output_name'],
    }).runSingle()

    if not results.isSuccessful():
        raise RuntimeError('Pig job failed')
    print('Pig job succeeded')

    result_iter = results.result("final_set").iterator()
    # This takes care of turning our iter into something we can use.
    # presumably this populates self.result, which is read below -- TODO confirm
    self.make_dict_from_results(result_iter)

    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()

    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
示例2: run
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
def run(self):
    """Run this step's bound Pig script and touch its flag file on success.

    Prints the step's name and description first. If the script succeeds,
    an empty file is created at ``self.flag_file_path`` via ``touchz``.

    Raises:
        Exception: if the script fails; the message explains how to restart
            the workflow from this step with ``-p CHECKPOINT=<script_name>``.
    """
    print("%s: %s" % (self.script_name, self.description))
    outcome = self.bound_script.runSingle()

    if not outcome.isSuccessful():
        failure_text = (
            "\nScript %s failed! Error should be logged above.\n" % self.script_name
            + "Once you have fixed the problem, you can restart the workflow at this step "
            + "using the argument \"-p CHECKPOINT=%s\"" % self.script_name
        )
        raise Exception(failure_text)

    Pig.fs("touchz %s" % self.flag_file_path)
示例3: runbidi
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
def runbidi(src, fdest):
    """Run bidi.pig repeatedly, feeding each pass's output into the next.

    At most ten passes are made. Pass ``n`` writes to ``fdest + 'gmNNNN'``.
    After each pass the alias ``S`` is inspected: if it has rows, the counts
    path is removed and another pass starts from the output just written;
    if it is empty, that output is moved to ``fdest`` and the loop stops.

    Args:
        src: input path for the first pass.
        fdest: final destination path; also the prefix for per-pass outputs.

    Raises:
        RuntimeError: if any pass fails. (The original raised a bare string,
            which is itself a TypeError on Python/Jython 2.6+.)
    """
    script = Pig.compileFromFile('src/main/pig/bidi.pig')
    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)

    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase

        job = script.bind({'src': src, 'dest': dest, 'cnts': cnts}).runSingle()
        if not job.isSuccessful():
            raise RuntimeError('failed')

        # The output of this pass becomes the input of the next one.
        src = dest

        # Named "rows" rather than "iter" so the builtin is not shadowed.
        rows = job.result('S').iterator()
        if rows.hasNext():
            Pig.fs('rmr ' + cnts)
        else:
            Pig.fs('mv ' + dest + ' ' + fdest)
            print('ALL DONE!')
            break
示例4: import_logs
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
def import_logs(profile):
    """Import all the log files for a given day, putting each in a log dir.

    If the profile is a list there are multiple files, otherwise only a
    single one. The imported files are combined later when web_load.pig runs.

    Exits the process with status 1 if any import run is unsuccessful.
    """
    # Clean up any files left over from the last run.
    # NOTE(review): this loop indexes every element with 'TMPDIR'/'NAME', so
    # it looks like it expects a sequence of dicts -- confirm the single-file case.
    for entry in profile:
        Pig.fs('rmr %s/%s' % (entry['TMPDIR'], entry['NAME']))

    outcome = Pig.compileFromFile('web_import.pig').bind(profile).run()

    # run() hands back either one stats object or an iterable of them;
    # treat both shapes uniformly when checking for load errors.
    if isinstance(outcome, org.apache.pig.tools.pigstats.SimplePigStats):
        all_stats = [outcome]
    else:
        all_stats = outcome

    for stat in all_stats:
        if not stat.isSuccessful():
            print('Error in web log load, %s' % stat.getErrorMessage())
            sys.exit(1)
示例5: main
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
def main(argv=None):
    """Run the web-log processing pipeline for one time frame.

    The profile path and time frame are read from the ``config_file`` and
    ``timeframe`` environment variables; ``argv`` is currently unused.

    Returns:
        1 if the time frame is not one of daily/weekly/monthly, otherwise
        None. A failed Pig run terminates the process via ``sys.exit(1)``.
    """
    # Ideally I want to use arguments, ie
    # 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily'
    # however it just doesn't work, I'm not sure why the code has been applied
    # in my version, and I can get it to work with a test .py that only has two
    # lines, import sys, and print sys.argv. Here is the case
    # https://issues.apache.org/jira/browse/PIG-2548
    #    if argv is None:
    #        argv = sys.argv
    #    if len(argv) != 3:
    #        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #        return 1
    #
    #    profile_file = argv[1]
    #    timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if timeframe not in ('daily', 'weekly', 'monthly'):
        print('The time frame must be either daily, weekly or monthly.')
        return 1

    # Load the config.
    # NOTE(review): execfile executes arbitrary code from profile_file; only
    # point this at trusted configuration files.
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)

    # Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    # Start pig processing
    pig_init()
    if timeframe == 'daily':
        # Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])

    # The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()

    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            # Report the error from `stats` itself; the original referenced
            # `run`, which is not bound in this branch and raised NameError.
            print('Error in web log stats, %s' % stats.getErrorMessage())
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print('Error in web log stats, %s' % run.getErrorMessage())
                sys.exit(1)
示例6: sqrt
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Fragment from the interior of a k-means driver loop. The enclosing loop
# headers and the definitions of x, y, x_move, y_move, i, k, tolerance,
# centroids and iter_num lie outside this excerpt, and the original
# indentation was lost, so the exact nesting cannot be recovered from here.

# Add this centroid's displacement to the running total.
# presumably x_move / y_move are squared coordinate deltas -- TODO confirm
distance_move = distance_move + sqrt(x_move + y_move)
print distance_move
# Record the new centroid and append it to the "x,y" string of centroids.
new_centroid = (x, y)
centroids.append(new_centroid)
initial_centroids = initial_centroids + str(x) + "," + str(y)
# Centroids in the string are separated by ':'; none after the last one.
if i != k - 1:
initial_centroids = initial_centroids + ":"
iter_num = iter_num + 1
# Turn the summed displacement into an average over the k centroids.
distance_move = distance_move / k
# Above the tolerance: remove the "grouped" and "output" paths.
# NOTE(review): whether both removals sit under this `if` is not visible here.
if distance_move > tolerance:
Pig.fs("rmr grouped")
Pig.fs("rmr output")
print ("iteration " + str(iter_num))
print ("average distance moved: " + str(distance_move))
# At or below the tolerance: report the centroids and leave the loop.
if distance_move <= tolerance:
sys.stdout.write("k-means converged at centroids: [")
sys.stdout.write(",".join(str(v) for v in centroids))
sys.stdout.write("]\n")
converged = True
break
# Keep this iteration's centroids for comparison in the next one.
last_centroids = centroids
print last_centroids
print initial_centroids
示例7: range
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Iterate the compiled k-means script P until the centroids stop moving by
# more than `tolerance` on average, or MAX_ITERATION passes have been made.
# P, k, tolerance, MAX_ITERATION, last_centroids and initial_centroids are
# expected to be defined earlier in the script.
iter_num = 0
while iter_num < MAX_ITERATION:
    # Bind the current centroids (a ':'-separated string) and run one pass.
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()

    # isSuccessful() returns a boolean. The original compared it to the
    # string "FAILED", which is never equal, so failures went undetected.
    # It also raised a bare string, which is invalid on Python 2.6+.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")

    # Named "rows"/"row" rather than "iter"/"tuple" so builtins are not shadowed.
    rows = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0

    # Get the new centroids of this iteration and calculate how far each one
    # moved compared with the last iteration.
    for i in range(k):
        row = rows.next()
        centroids[i] = float(str(row.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k

    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))

    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    # Not converged: the new centroids become the input of the next pass.
    last_centroids = centroids[:]
    initial_centroids = ":".join(str(c) for c in last_centroids)
    iter_num += 1
示例8: long
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Fragment of a graph-sampling driver; the enclosing definition and the
# variables graph, preprocess_graph, preprocess_num_vertices, nhood_size,
# seed_vertices, iteration_verts_prefix, output_path and tmp_dir are defined
# outside this excerpt. Original indentation was lost.

# Step 1: preprocessing -- run graph_sampler_preprocess.pig once.
print "Graph Sampler: starting preprocessing step."
preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
"GRAPH_INPUT_PATH" : graph,
"GRAPH_OUTPUT_PATH" : preprocess_graph,
"NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices
}).runSingle()
# Step 2: iteration -- compile once, bind and run (nhood_size - 1) times.
iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
num_iterations = nhood_size - 1
# First field of the first row of the "num_vertices" alias, parsed as a long.
# NOTE(review): num_vertices is not read again in the visible code.
num_vertices = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))
print "Graph Sampler: scheduling %d iterations" % num_iterations
for i in range(num_iterations):
print "Graph Sampler: starting iteration step %d" % (i+1)
# The first iteration reads the seed vertices; each later one reads the
# output written by the previous iteration (prefix + previous index).
iteration = iteration_script.bind({
"VERTICES_INPUT_PATH" : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)),
"GRAPH_INPUT_PATH" : preprocess_graph,
"VERTICES_OUTPUT_PATH" : iteration_verts_prefix + str(i)
}).runSingle()
# Path of the most recent iteration output; consumed by postprocessing below.
iteration_result = iteration_verts_prefix + str(i)
# Step 3: postprocessing -- run graph_sampler_postprocess.pig on the last output.
print "Graph Sampler: starting postprocessing step."
postprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_postprocess.pig").bind({
"GRAPH_INPUT_PATH" : graph,
"VERTICES_INPUT_PATH" : iteration_result,
"SAMPLE_OUTPUT_PATH" : output_path,
}).runSingle()
# NOTE(review): none of the runSingle() results above is checked with
# isSuccessful() in the visible code.
print "Graph Sampler: deleting temporary output directory"
Pig.fs("rmr " + tmp_dir)
示例9: run_pagerank
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
#.........这里部分代码省略.........
# The `def` line and the leading parameters of this function were elided by
# the source page; edges_input, output_path and tmp_output_dir are used below
# and are presumably among them -- TODO confirm.
damping_factor=0.85,
convergence_threshold=0.0001,
max_num_iterations=10,
id_name_map=None,
# NOTE(review): preprocessing_script and iteration_script are never read in
# the visible body; the same two paths are hard-coded where the scripts run.
preprocessing_script="../pigscripts/pagerank_preprocess.pig",
iteration_script="../pigscripts/pagerank_iterate.pig"
):
"""
Calculates pageranks for directed graph of nodes and edges.
Three main steps:
1. Preprocessing: Process input data to:
a) Count the total number of nodes.
b) Prepare initial pagerank values for all nodes.
2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
nodes with edges going into the given node.
3. Postprocessing: Order nodes by pagerank
Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
to get human-readable names
Temporary preprocess and iteration directories under tmp_output_dir are
removed at the end.
"""
preprocess_dir = "%s/preprocess" % tmp_output_dir
iteration_dir = "%s/iteration" % tmp_output_dir
# Preprocessing step:
print "Starting preprocessing step."
preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig").bind({
"INPUT_PATH" : edges_input,
"PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
"NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
}).runSingle()
# Update convergence threshold based on the size of the graph (number of nodes)
# The threshold is scaled by num_nodes squared and truncated to a long.
num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)
# Iteration step:
# Builds the Pig parameter dict for iteration it_num: iteration 1 reads the
# preprocessing output, later iterations read the previous iteration's output.
def iteration_param_func(it_num, it_dir):
if it_num == 1:
iteration_input = "%s/pageranks" % preprocess_dir
else:
iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)
return {
"INPUT_PATH" : iteration_input,
"DAMPING_FACTOR" : damping_factor,
"NUM_NODES" : num_nodes,
"PAGERANKS_OUTPUT_PATH" : "%s/%d/pageranks" % (it_dir, it_num),
"AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
}
# The returned mapping is read below for its "num_iterations" entry.
iteration_result = IterationUtils.iterate_until_convergence(
"../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
iteration_dir, # temporary iteration outputs will be stored here
iteration_param_func, # takes iteration #, returns Pig parameter dictionary
"Sum of ordering-rank changes", # name of the convergence metric
int, # Python type of the convergence metric
"aggregate_rank_change", # alias in the pigscript where the metric is stored to
convergence_threshold, # stop when metric less than this
max_num_iterations # or if this many iterations have been performed
)
# Postprocessing step:
print "Starting postprocessing step."
# Common prefix of the postprocessing script; one of the two suffixes below
# is appended depending on whether an id->name map was supplied.
postprocess_script = """
pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);
pageranks = FILTER pageranks BY pagerank IS NOT NULL;
"""
if id_name_map:
# With a name map: join names onto the pageranks before ordering.
postprocess_script += """
id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
ordered = ORDER with_names BY pagerank DESC;
rmf $OUTPUT_PATH;
STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
"""
postprocess = Pig.compile(postprocess_script).bind({
"PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
"ID_NAME_MAP_INPUT_PATH" : id_name_map,
"OUTPUT_PATH" : output_path
}).runSingle()
else:
# Without a name map: order the (id, pagerank) pairs directly.
postprocess_script += """
ordered = ORDER pageranks BY pagerank DESC;
rmf $OUTPUT_PATH;
STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
"""
postprocess = Pig.compile(postprocess_script).bind({
"PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
"OUTPUT_PATH" : output_path
}).runSingle()
# Remove the temporary preprocessing and iteration outputs.
Pig.fs("rmr %s" % preprocess_dir)
Pig.fs("rmr %s" % iteration_dir)
示例10: xrange
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper
# Driver setup for an iterative fit: read the command-line arguments, clear
# old weight files and write a random initial weight vector to a local file.
# NOTE: 10e-6 evaluates to 1e-05.
EPS = 10e-6  # maximum distance between consecutive weights for convergence
pig_script = sys.argv[1]  # pig script to run iteratively
data_dir = sys.argv[2]  # directory where intermediate weights will be written
features = sys.argv[3]  # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features (kept as a string; converted below)

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({"name": "w%s" % i, "type": 25, "schema": None})  # See Pig's DataType.java

# Write the weights as one tab-separated line in a fresh temporary directory.
path = tempfile.mkdtemp()
w0 = open("%s/part-r-00000" % path, 'w')
try:
    w0.write("\t".join(weights) + "\n")
finally:
    # Close the handle even if the write fails, so it is never leaked.
    w0.close()
示例11: int
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Fragment of a k-means driver's setup; initialCentroidsFile is assigned
# before this excerpt begins.
hdfsInputDir = sys.argv[2]
print initialCentroidsFile
# NOTE(review): k is fixed at 4 while the centroid count is read separately
# from argv[3]; how the two relate is not visible here.
k = 4
numOfCentroids = int(sys.argv[3])
# Kept as a string because it is spliced into the Pig script text further down.
numOfReducer = str(sys.argv[4])
# numOfMapper = str(sys.argv[5])
tolerance = 0.01
MAX_ITERATION = int(sys.argv[5])
initial_centroids = ""
# only support local path currently
# The centroid file is uploaded under /tmp/ using its local name.
hdfsCentroidFilePath = "/tmp/"+initialCentroidsFile
# "<path>#<name>" form, used as the value of mapred.cache.files further down.
cachedCentroidFilePath = hdfsCentroidFilePath+"#"+initialCentroidsFile
# Remove any previous copy at the target path, then upload the local file.
Pig.fs("rm -r "+hdfsCentroidFilePath)
Pig.fs("put "+initialCentroidsFile+" " + hdfsCentroidFilePath)
#print initial_centroids
pigScript = ("""SET default_parallel """+numOfReducer+""";
SET pig.noSplitCombination true;
-- set mapred.child.java.opts '-Xmx900m';
set mapred.map.tasks.speculative.execution false;
SET mapred.cache.files """+cachedCentroidFilePath+""";
register pig-kmeans-udf-yarn.jar;
-- DEFINE find_centroid FindCentroid('$centroids');
raw = load '"""+hdfsInputDir+"""' using BinaryDataLoader('$centroids','"""+str(numOfCentroids)+"""') as (datapoints);
-- line below may be the bottleneck
datapointbag = foreach raw generate FLATTEN(datapoints) as datapointInString:chararray;
示例12:
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Output layout: everything lives under "<input>_counts_m<min_count>".
# _in, _min_count and _header are defined before this excerpt; _min_count is
# concatenated directly, so it is presumably already a string -- TODO confirm.
_out = _in + '_counts_m' + _min_count
_out_nc = _out + '/count'
_out_v = _out + '/vocab'
_out_nf = _out + '/nfollow'
_out_np = _out + '/nprecede'
_out_nfp = _out + '/nfollowerprecede'
_out_njc = _out + '/countsjoined'
##
# start actual pig jobs
#
from org.apache.pig.scripting import Pig
# if output path does not exist, create it
# (a truthy result from "-test -d" is treated here as "directory missing")
if Pig.fs('-test -d ' + _out):
Pig.fs('mkdir ' + _out)
##
# CountJob
#
# If the count job's output path already exists, skip the job;
# otherwise run it.
##
if not Pig.fs('-test -d ' + _out_nc):
print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
result = Pig.compile(_header + """
count_ngrams( '${in}', '${out}', '${min_count}' );
""").bind({'in':_in, 'out':_out_nc, 'min_count': _min_count, 'n':'count-ngrams'}).runSingle()
# check the result
# (the body of this check is cut off in this excerpt)
if not result.isSuccessful():
示例13: str
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Fragment of an iterative driver; i, iteration and stable_iterations are
# defined before this excerpt, and the loop body is cut off at the end.
# The first pass reads the edges file written for index i - 1.
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''
while True:
print "*** Iteration " + str(i) + " ***"
edges_out = 'edges' + str(i) + '.tmp'
iteration_bound = iteration.bind({'EDGES_IN': edges_in, 'EDGES_OUT': edges_out,
'CONVERGENCE_OUT': 'convergence.tmp'})
iteration_stats = iteration_bound.runSingle()
# NOTE(review): raising a bare string is invalid on Python/Jython 2.6+
# (it raises TypeError instead); this should raise an Exception instance.
if not iteration_stats.isSuccessful():
raise 'Iteration failed'
# First row of the "convergence" alias carries three fields, read in order:
# max iterations, convergence iterations, and the change count.
conv_result = iteration_stats.result('convergence').iterator().next()
max_iter = int(str(conv_result.get(0)))
conv_iter = int(str(conv_result.get(1)))
change_count = int(str(conv_result.get(2)))
# Remove the convergence output and the edges file just consumed.
Pig.fs('rm -r ' + 'convergence.tmp')
Pig.fs('rm -r ' + edges_in)
# This pass's output is the next pass's input.
edges_in = edges_out
print "Decision change count: " + str(change_count)
# Count consecutive iterations with no changes; any change resets the count.
if change_count == 0:
stable_iterations += 1
else:
stable_iterations = 0
print "Stable iterations: " + str(stable_iterations)
print "Convergence iterations: " + str(conv_iter)
print "Max iterations: " + str(max_iter)
if stable_iterations >= conv_iter:
print "Stopping due to convergence"
break
# NOTE(review): the excerpt ends here; the matching break and the increment
# of i are not visible.
if i >= max_iter:
print "Stopping due to max iterations reached"
示例14: xrange
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
import tempfile
from org.apache.pig.scripting import Pig
# Driver setup for an iterative fit: read the command-line arguments, clear
# old weight files, and write a random initial weight vector to a temp file
# that is then scheduled for upload as "weight-0".
# NOTE: 10e-6 evaluates to 1e-05.
EPS = 10e-6  # maximum distance between consecutive weights for convergence
pig_script = sys.argv[1]  # pig script to run iteratively
data_dir = sys.argv[2]  # directory where intermediate weights will be written
features = sys.argv[3]  # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features (kept as a string; converted below)

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
weights = [str(random.random()) for _ in xrange(int(num_features))]

fd, path = tempfile.mkstemp()
try:
    f = open(path, 'w')
    try:
        f.write("\t".join(weights) + "\n")
    finally:
        f.close()
finally:
    # Always release the descriptor returned by mkstemp, even when opening
    # or writing the file fails.
    os.close(fd)

copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
示例15: basename
# 需要导入模块: from org.apache.pig.scripting import Pig [as 别名]
# 或者: from org.apache.pig.scripting.Pig import fs [as 别名]
# Fragment of a pagerank driver's argument handling; dataset, basename and
# preprocessedGraph are defined before this excerpt.
# NOTE(review): both conditions below are identical (`<= 4`). docs_in is left
# unassigned when more than four arguments are given, and argv[3] only exists
# when there are at least four -- the intended comparisons look different; confirm.
if (len(sys.argv) <= 4):
docs_in = sys.argv[2]
if (len(sys.argv) <= 4):
start_at = (int)(sys.argv[3])
else:
start_at = 0
# Working directory for intermediate data, derived from the graph's basename.
out_dir = "%s/tmp/%s" % (dataset, basename(preprocessedGraph))
inputType = "chararray" #use long if we have hashed urls
# Run up to 20 pagerank iterations, skipping those before start_at, and stop
# early once the reported maximum difference drops below 0.01.
# P, start_at, out_dir and docs_in are expected to be defined earlier.
for i in range(20):
    if i < start_at:
        continue

    # NOTE(review): the names are appended to out_dir with no path separator.
    docs_out = out_dir + "pagerank_data_" + str(i + 1)
    max_diff = out_dir + "max_diff_" + str(i + 1)

    # Clear any leftovers for this iteration's outputs before running.
    Pig.fs("rmr " + docs_out)
    Pig.fs("rmr " + max_diff)

    # bind() with no arguments; presumably it picks the parameters up from
    # the variables in scope -- TODO confirm.
    stats = P.bind().runSingle()
    if not stats.isSuccessful():
        # The original raised a bare string, which is itself a TypeError on
        # Python/Jython 2.6+; raise a real exception instead.
        raise RuntimeError('failed')

    max_diff_value = float(str(stats.result("max_diff").iterator().next().get(0)))
    print(" max_diff_value = " + str(max_diff_value))
    if max_diff_value < 0.01:
        print("done at iteration " + str(i) + ". Cleaning output")
        break

    # max_diff of previous iterations never used, so clean it up
    Pig.fs("rmr " + max_diff)
    if i > 1:
        # never for 1st iteration! (otherwise we delete original input...
        # NOTE(review): `i > 1` also skips the second iteration (i == 1), and
        # docs_in is not reassigned inside the visible loop -- confirm intent.
        Pig.fs("rmr " + docs_in)