This article collects typical usage examples of the Python class org.apache.pig.scripting.Pig. If you are wondering what the Pig class is for or how to use it, the hand-picked examples below should help.
The 15 Pig class code examples shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
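Before the examples, here is a minimal sketch of the workflow most of them share: compile a Pig Latin script, bind parameters, run it, and inspect the result. The script text, parameter names, and paths below are illustrative assumptions, not taken from any example on this page.

from org.apache.pig.scripting import Pig

# Hypothetical script and paths, for illustration only.
P = Pig.compile("""
data = LOAD '$input' AS (name:chararray, value:int);
filtered = FILTER data BY value > 0;
STORE filtered INTO '$output';
""")
stats = P.bind({'input': '/tmp/example_in', 'output': '/tmp/example_out'}).runSingle()
if not stats.isSuccessful():
    raise Exception('Pig job failed: ' + stats.getErrorMessage())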
Example 1: __init__

def __init__(self, params):
    # BIND and RUN
    self.params = params
    self.set_param_defaults()
    Pig.fs("rmr " + self.params['output_name'])
    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()
    P = Pig.compile(full_script)
    results = P.bind({
        'output': self.params['output_name'],
    }).runSingle()
    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        raise Exception('Pig job failed')
    result_iter = results.result("final_set").iterator()
    # This takes care of turning our iterator into something we can use.
    self.make_dict_from_results(result_iter)
    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()
    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
Example 2: __init__

def __init__(self, jars=[], properties={}):
    ''' Initialize Pig. '''
    for jar in jars:
        logger.debug(" >>> register jar: %s", jar)
        Pig.registerJar(jar)
    for key in properties:
        logger.debug(" >>> set property: %s => %s", key, properties[key])
        Pig.set(key, properties[key])
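A possible way to call this initializer, assuming it belongs to a wrapper class (here called PigRunner, a made-up name) and that udf.jar exists locally:

# Hypothetical usage; the class name, jar path, and property are illustrative.
runner = PigRunner(jars=['udf.jar'],
                   properties={'pig.import.search.path': '/opt/pig/scripts'})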
Example 3: run

def run(self):
    print "%s: %s" % (self.script_name, self.description)
    stats = self.bound_script.runSingle()
    if stats.isSuccessful():
        Pig.fs("touchz %s" % self.flag_file_path)
    else:
        raise Exception("\nScript %s failed! Error should be logged above.\n" % self.script_name +
                        "Once you have fixed the problem, you can restart the workflow at this step " +
                        "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
Example 4: runbidi

def runbidi(src, fdest):
    P = Pig.compileFromFile('src/main/pig/bidi.pig')
    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)
    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase
        params = {'src': src, 'dest': dest, 'cnts': cnts}
        bound = P.bind(params)
        job = bound.runSingle()
        if not job.isSuccessful():
            raise Exception('failed')
        src = dest
        iter = job.result('S').iterator()
        if iter.hasNext():
            Pig.fs('rmr ' + cnts)
        else:
            Pig.fs('mv ' + dest + ' ' + fdest)
            print 'ALL DONE!'
            break
Example 5: run

def run(self, params, script_name, script_file, elements=[]):
    ''' Execute pig. '''
    pig = Pig.compileFromFile(script_name, script_file)
    bound = pig.bind(params)
    futures = bound.run() if isinstance(params, list) else bound.runSingle()
    self.handle_future(futures, elements)
    self.complete()
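Note the dispatch on the type of params: binding a list of parameter dictionaries makes run() launch one job per dictionary and return a list of PigStats, while runSingle() returns a single one. A sketch of a handle_future compatible with both shapes (an assumption; the original implementation is not shown):

# Hypothetical helper; the original handle_future is not part of this example.
def handle_future(self, futures, elements=[]):
    stats_list = futures if isinstance(futures, list) else [futures]
    for stats in stats_list:
        if not stats.isSuccessful():
            raise Exception('Pig job failed: ' + stats.getErrorMessage())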
Example 6: run_script

def run_script():
    import os
    from org.apache.pig.scripting import Pig
    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
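Note that bind() is called with no arguments here: in Pig's embedded scripting, a parameterless bind resolves any $parameters in the script against same-named variables in the enclosing Jython scope. A small sketch (the script path and parameter name are assumptions):

from org.apache.pig.scripting import Pig

# The script is assumed to reference $input_path; bind() picks the value up
# from this local variable rather than from an explicit dictionary.
input_path = '/tmp/example_in'
P = Pig.compileFromFile("../pigscripts/example.pig")
P.bind().runSingle()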
Example 7: main

def main(argv=None):
    # Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
    # however it just doesn't work. I'm not sure why, since the fix has been applied in my version, and I can get it
    # to work with a test .py that has only two lines: import sys, and print sys.argv. Here is the case:
    # https://issues.apache.org/jira/browse/PIG-2548
    # if argv is None:
    #     argv = sys.argv
    # if len(argv) != 3:
    #     print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #     return 1
    #
    # profile_file = argv[1]
    # timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']
    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1
    # Load the config
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)
    # Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])
    # Start pig processing
    pig_init()
    if timeframe == 'daily':
        # Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    # The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
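The isinstance check exists because bind(dict).run() returns a single SimplePigStats while bind(list).run() returns a list of them. A compact way to normalize the two shapes (a sketch, not from the original source):

# Hypothetical helper that normalizes run() output to a list of PigStats.
def as_stats_list(stats):
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        return [stats]
    return list(stats)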
Example 8: import_logs

def import_logs(profile):
    """ Import all the log files for a given day and process them, putting each in a log dir.
    If the profile is a list, there are multiple files; otherwise only a single one.
    The files are combined when running web_load.pig.
    """
    # Clean up any leftover files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    # Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Example 9: run_script

def run_script():
    import os
    from org.apache.pig.scripting import Pig
    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")
        bound = P.bind({"ITERATION_NUM": i})
        ps = bound.runSingle()
        print 'Run %s done!' % i
        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r
        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
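ps.result(alias) returns the output for that alias, and its iterator yields Pig Tuple objects whose fields come back as Java objects, hence the r.get(1).toString() dance before converting to a number. A sketch of pulling typed values out of a result tuple (the alias and field layout are assumptions):

# Hypothetical result-reading sketch; alias and schema are made up.
it = ps.result("avg_split_song_count").iterator()
while it.hasNext():
    t = it.next()                            # org.apache.pig.data.Tuple
    split_id = t.get(0).toString()           # field 0 as a string
    avg_count = float(t.get(1).toString())   # field 1 as a float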
Example 10: range

MIN_SCORE = 0
MAX_ITERATION = 5

# initial centroids, equally dividing the space (k and MAX_SCORE are defined earlier in the source)
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
DEFINE find_centroid FindCentroid('$centroids');
raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroids for this iteration and calculate how far they moved since the last one
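The listing is cut off here. Based on the variables already set up, the omitted tail of the loop presumably reads the new centroids from the iterator, measures how far they moved, and stops once the movement falls below a threshold. A sketch of that missing tail; the threshold name tolerance is an assumption:

# Hypothetical continuation of the loop above; 'tolerance' is an assumed threshold.
from math import fabs

i = 0
while iter.hasNext():
    tuple = iter.next()
    centroids[i] = float(str(tuple.get(1)))
    distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    i = i + 1
distance_move = distance_move / k
if distance_move < tolerance:
    print "converged at iteration " + str(iter_num)
    converged = True
    break
last_centroids = centroids[:]
initial_centroids = ":".join([str(c) for c in last_centroids])
iter_num += 1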
Example 11: str

print 'LOG: Elapsed %f' % (endTime - startTime)

# Remove the guardFile
fs.delete(guardFile, True)
System.exit(0)

if fs.exists(parsedDir):
    # parsed-captures
    if (not fs.exists(parsedCaptures) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {'INPUT'  : str(parsedDir),
                  'OUTPUT' : str(parsedCaptures),
                  'JOBNAME': str(collection) + ' parsed-captures'}
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph) or
            fs.getFileStatus(parsedDir).getModificationTime() > fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {'INPUT'  : str(parsedDir),
                  'OUTPUT' : str(linkGraph),
                  'JOBNAME': str(collection) + ' link-graph'}
Example 12: int

from math import ceil, log
from org.apache.pig.scripting import Pig

if __name__ == "__main__":
    params = Pig.getParameters()
    graph = params["GRAPH"]
    seed_vertices = params["SEED_VERTICES"]
    tmp_dir = params["TMP_DIR"]
    output_path = params["OUTPUT_PATH"]
    nhood_size = int(params["NEIGHBORHOOD_SIZE"])

    preprocess_graph = "%s/preprocess/graph" % tmp_dir
    preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir
    iteration_verts_prefix = "%s/iteration/vertices_" % tmp_dir

    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH": graph,
        "GRAPH_OUTPUT_PATH": preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations = nhood_size - 1
    num_vertices = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i + 1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH": seed_vertices if i == 0 else (iteration_verts_prefix + str(i - 1)),
Example 13: range

MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids, equally dividing the space (k and MAX_SCORE are defined earlier in the source)
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
DEFINE find_centroid FindCentroid('$centroids');
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid parallel 2;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'kmoutput';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroids for this iteration and calculate how far they moved since the last one
Example 14: str

par = {}
par['DATA_DATE'] = day
par['REPORT_DATE'] = "2013/07/" + str(i)
params.append(par)

prefix = "2013080"
for i in range(1, 10):
    day = prefix + str(i)
    par = {}
    par['DATA_DATE'] = day
    par['REPORT_DATE'] = "2013/08/0" + str(i)
    params.append(par)

Pig.registerUDF("attribute_click.py", "myfuncs")

# ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');
script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;
job_views = FOREACH job_view_events GENERATE
    (int)header.memberId AS memberId,
    (long)header.time AS time,
    trackingCode,
    (int)trackingInfo#'0' AS jobId;
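The params list built above pairs naturally with multi-binding: compiling the script once and binding it to the whole list runs one job per parameter dictionary. The excerpt is truncated, but a sketch of how it presumably proceeds:

# Hypothetical completion: bind the parameter list and run one job per date.
P = Pig.compile(script)
stats_list = P.bind(params).run()   # a list of PigStats, one per param dict
for stats in stats_list:
    if not stats.isSuccessful():
        print 'Run failed: %s' % stats.getErrorMessage()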
Example 15: run_pagerank

def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"
                ):
    """
    Calculates pageranks for a directed graph of nodes and edges.

    Three main steps:
    1. Preprocessing: Process input data to:
       a) Count the total number of nodes.
       b) Prepare initial pagerank values for all nodes.
    2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
       nodes with edges going into the given node.
    3. Postprocessing: Order nodes by pagerank.
       Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
       to get human-readable names.
    """
    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
        "INPUT_PATH": edges_input,
        "PAGERANKS_OUTPUT_PATH": "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH": "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update the convergence threshold based on the size of the graph (number of nodes)
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)
        return {
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": damping_factor,
            "NUM_NODES": num_nodes,
            "PAGERANKS_OUTPUT_PATH": "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH": "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        iteration_script,               # the pigscript to iterate
        iteration_dir,                  # temporary iteration outputs will be stored here
        iteration_param_func,           # takes iteration #, returns Pig parameter dictionary
        "Sum of ordering-rank changes", # name of the convergence metric
        int,                            # Python type of the convergence metric
        "aggregate_rank_change",        # alias in the pigscript where the metric is stored
        convergence_threshold,          # stop when the metric is less than this
        max_num_iterations              # or when this many iterations have been performed
    )

    # Postprocessing step:
    print "Starting postprocessing step."

    postprocess_script = """
        pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);
        pageranks = FILTER pageranks BY pagerank IS NOT NULL;
    """

    if id_name_map:
        postprocess_script += """
            id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
            with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
            ordered = ORDER with_names BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """
        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH": "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "ID_NAME_MAP_INPUT_PATH": id_name_map,
            "OUTPUT_PATH": output_path
        }).runSingle()
    else:
        postprocess_script += """
            ordered = ORDER pageranks BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """
        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH": "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "OUTPUT_PATH": output_path
        }).runSingle()

# ......... (the rest of this code is omitted here) .........
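A hypothetical invocation of run_pagerank, with every path invented for illustration:

# Hypothetical call; all paths here are made up.
run_pagerank("s3n://my-bucket/graph/edges",
             "s3n://my-bucket/graph/pageranks_final",
             "s3n://my-bucket/tmp/pagerank",
             damping_factor=0.85,
             max_num_iterations=20,
             id_name_map="s3n://my-bucket/graph/node_names")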