This page collects typical usage examples of the Pig.compile method from the Python module org.apache.pig.scripting. If you are wondering how to use Pig.compile, what it does, or what calling code looks like in practice, the curated examples below should help. You can also explore the containing class, org.apache.pig.scripting.Pig.
The following 13 code examples of Pig.compile are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
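In the embedded (Jython) Pig API, Pig.compile() turns a Pig Latin string into a reusable script object: bind() substitutes its $-parameters and runSingle() (or run(), for a list of bindings) executes it and returns a PigStats result. A minimal sketch of the pattern every example below follows (the paths and schema are illustrative):

from org.apache.pig.scripting import Pig

P = Pig.compile("""
data = LOAD '$input' AS (name:chararray, value:int);
STORE data INTO '$output';
""")
stats = P.bind({'input': 'in.txt', 'output': 'out_dir'}).runSingle()
if not stats.isSuccessful():
    raise Exception('Pig job failed')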
Example 1: __init__
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
def __init__(self, params):
# BIND and RUN
self.params = params
self.set_param_defaults()
Pig.fs("rmr " + self.params['output_name'])
generator = PigScriptGenerator.PigScriptGenerator(self.params)
full_script = generator.generate()
P = Pig.compile( full_script )
results = P.bind({
'output':self.params['output_name'],
}).runSingle()
    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        raise Exception('Pig job failed')
result_iter = results.result("final_set").iterator()
    # This takes care of turning our iterator into something we can use.
self.make_dict_from_results(result_iter)
send_to_grapht = raw_input('do you want to send this data to grapht?')
if send_to_grapht not in ('y', 'yes', '1'):
sys.exit()
connector = GraphtConnector('grapht.shuttercorp.net')
metric = self.params['output_name']
connector.record_data_points(metric, self.result)
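The make_dict_from_results helper is not shown in this snippet. A possible shape for it, assuming each result tuple is a (key, value) pair (hypothetical, for illustration only):

def make_dict_from_results(self, result_iter):
    # result_iter is assumed to be a java.util.Iterator over Pig Tuples
    self.result = {}
    while result_iter.hasNext():
        t = result_iter.next()
        self.result[str(t.get(0))] = str(t.get(1))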
Example 2: AVG
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
"""
if aggregateMethod == "avg":
pigScript += """
rankedTriples = FOREACH objGroup GENERATE
$0,$1,$2,
AVG({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;"""
elif aggregateMethod == "max":
pigScript += """
rankedTriples = FOREACH objGroup GENERATE
$0,$1,$2,
MAX({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;"""
elif aggregateMethod == "min":
pigScript += """
rankedTriples = FOREACH objGroup GENERATE
$0,$1,$2,
MIN({($4 is null? 1F: $4),($6 is null? 1F: $6)}) AS ranking;"""
else:
pigScript += """
WRONGGGG. how to aggregate?!"""
pigScript += """
rmf $outputFile
STORE rankedTriples INTO '$outputFile' USING PigStorage();
"""
P = Pig.compile(pigScript)
stats = P.bind().runSingle()
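Note that bind() is called with no arguments here: in embedded Pig, a no-argument bind resolves the script's $-parameters (such as $outputFile) from same-named variables in the enclosing Jython scope. An explicit binding is equivalent (the variable name is hypothetical):

stats = P.bind({'outputFile': output_file}).runSingle()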
Example 3: str
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
vertica.accert_table_exists(table_name)
table_size = vertica.get_table_size(table_name)
logger.info(table_name + " table size is " + str(table_size) + " bytes")
output_dir = "/user/mykhail.martsynyuk/vertica/export/"+table_name
#prepare hdfs structure
logger.info("Move folder "+output_dir+" to backup")
hdfs.move_folder_to_backup(output_dir)
logger.info("Remove "+output_dir)
hdfs.remove_folder(output_dir)
params.append({'out':output_dir, 'table':table_name})
P = Pig.compile("""
register /usr/lib/pig/lib/pig-vertica.jar
register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar
A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123');
STORE A INTO '$out';
""")
bound = P.bind(params)
stats_list = bound.run()
i = 0
for stats in stats_list:
if stats.isSuccessful():
logger.info("SUCCESS: Table: "+params[i]["table"]+"; Number jobs: "+str(stats.getNumberJobs())+ "; Time to run: "+str(stats.getDuration())+"; Files written: "+str(stats.getOutputLocations()))
else:
logger.info("FAIL: Table: "+params[i]["table"]+"; ERRORS: "+stats.getAllErrorMessages())
i+=1
# Next is example of how to get script output:
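The snippet ends before showing that example. A sketch of reading stored results back through PigStats.result(), assuming the STORE alias 'A' from the script above and an output location Pig can read:

for stats in stats_list:
    if stats.isSuccessful():
        it = stats.result("A").iterator()  # java.util.Iterator over the stored Tuples
        while it.hasNext():
            print it.next()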
Example 4: FindCentroid
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
if i != k - 1:
initial_centroids = initial_centroids + ":"
# initial_centroids = "37.475097, -122.155599:37.486098,-122.195388:37.4985769, -122.2195727:37.4608874, -122.143838:37.453407, -122.182255"
# initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
# last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]
print last_centroids
print initial_centroids
P = Pig.compile(
"""register Find.jar
DEFINE find_centroid FindCentroid('$centroids');
raw_data = load 'MP_match.txt' as (latitude:double, longitude:double, status:chararray);
centroided = foreach raw_data generate status, latitude, longitude, find_centroid(latitude, longitude) as centroid;
grouped = group centroided by centroid;
store grouped into 'grouped';
result = foreach grouped generate group, AVG(centroided.latitude), AVG(centroided.longitude);
store result into 'output';
"""
)
converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
Q = P.bind({"centroids": initial_centroids})
results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
iter = results.result("result").iterator()
centroids = []
Example 5: range
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
MIN_SCORE = 0
MAX_ITERATION = 100
# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
initial_centroids = initial_centroids + str(last_centroids[i])
if i!=k-1:
initial_centroids = initial_centroids + ":"
P = Pig.compile("""register udf.jar;
DEFINE find_centroid FindCentroid('$centroids');
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid parallel 2;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'kmoutput';
""")
converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
Q = P.bind({'centroids':initial_centroids})
results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
iter = results.result("result").iterator()
centroids = [None] * k
distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved since the last iteration
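The snippet is truncated at this comment. A sketch of how the loop body continues in Pig's own embedded k-means example, which this code closely follows (fabs comes from Jython's math module; a tolerance threshold, e.g. 0.01, is assumed defined earlier in the original):

    for i in range(k):
        tuple = iter.next()                        # one (centroid, AVG(gpa)) tuple per cluster
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k
    if distance_move < tolerance:                  # converged: centroids barely moved
        converged = True
        break
    last_centroids = centroids[:]                  # re-seed the next iteration
    initial_centroids = ":".join([str(c) for c in last_centroids])
    iter_num += 1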
Example 6: range
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
MIN_SCORE = 0
MAX_ITERATION = 5
# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
initial_centroids = initial_centroids + str(last_centroids[i])
if i!=k-1:
initial_centroids = initial_centroids + ":"
P = Pig.compile("""register udf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'output';
""")
converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
Q = P.bind({'centroids':initial_centroids})
results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
iter = results.result("result").iterator()
centroids = [None] * k
distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved since the last iteration
Example 7: run_pagerank
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
def run_pagerank(edges_input,
output_path,
tmp_output_dir,
damping_factor=0.85,
convergence_threshold=0.0001,
max_num_iterations=10,
id_name_map=None,
preprocessing_script="../pigscripts/pagerank_preprocess.pig",
iteration_script="../pigscripts/pagerank_iterate.pig"
):
"""
    Calculates PageRank scores for a directed graph of nodes and edges.
Three main steps:
1. Preprocessing: Process input data to:
a) Count the total number of nodes.
b) Prepare initial pagerank values for all nodes.
2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
nodes with edges going into the given node.
    3. Postprocessing: Order nodes by pagerank.
       Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
       to get human-readable names.
"""
preprocess_dir = "%s/preprocess" % tmp_output_dir
iteration_dir = "%s/iteration" % tmp_output_dir
# Preprocessing step:
print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
"INPUT_PATH" : edges_input,
"PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
"NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
}).runSingle()
# Update convergence threshold based on the size of the graph (number of nodes)
num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)
# Iteration step:
def iteration_param_func(it_num, it_dir):
if it_num == 1:
iteration_input = "%s/pageranks" % preprocess_dir
else:
iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)
return {
"INPUT_PATH" : iteration_input,
"DAMPING_FACTOR" : damping_factor,
"NUM_NODES" : num_nodes,
"PAGERANKS_OUTPUT_PATH" : "%s/%d/pageranks" % (it_dir, it_num),
"AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
}
iteration_result = IterationUtils.iterate_until_convergence(
"../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
iteration_dir, # temporary iteration outputs will be stored here
iteration_param_func, # takes iteration #, returns Pig parameter dictionary
"Sum of ordering-rank changes", # name of the convergence metric
int, # Python type of the convergence metric
"aggregate_rank_change", # alias in the pigscript where the metric is stored to
convergence_threshold, # stop when metric less than this
max_num_iterations # or if this many iterations have been performed
)
    # Postprocessing step:
print "Starting postprocessing step."
postprocess_script = """
pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);
pageranks = FILTER pageranks BY pagerank IS NOT NULL;
"""
if id_name_map:
postprocess_script += """
id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
ordered = ORDER with_names BY pagerank DESC;
rmf $OUTPUT_PATH;
STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
"""
postprocess = Pig.compile(postprocess_script).bind({
"PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
"ID_NAME_MAP_INPUT_PATH" : id_name_map,
"OUTPUT_PATH" : output_path
}).runSingle()
else:
postprocess_script += """
ordered = ORDER pageranks BY pagerank DESC;
rmf $OUTPUT_PATH;
STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
"""
postprocess = Pig.compile(postprocess_script).bind({
"PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
"OUTPUT_PATH" : output_path
}).runSingle()
#......... (part of the code omitted here) .........
Example 8: BinaryJSON
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
# ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');
script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');
job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;
job_views = FOREACH job_view_events GENERATE
(int)header.memberId AS memberId,
(long)header.time AS time,
trackingCode,
(int)trackingInfo#'0' AS jobId;
job_views = join job_views by memberId, member by memberId;
job_views = foreach job_views generate job_views::memberId as memberId, job_views::time as time, job_views::jobId as jobId;
job_views = filter job_views by memberId > 0;
job_views = distinct job_views parallel 1;
STORE job_views INTO '$OUTPUT/JYMBII-batch/history/view/$REPORT_DATE' USING BinaryJSON('memberId');
"""
prog = Pig.compile(script)
for para in params:
bound = prog.bind(para)
stats = bound.runSingle()
print "********************************Finish Current Data " + para['DATE'] + " *************************************************"
Example 9: AS
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
postString += "Result = FOREACH Result GENERATE " + fsDic['genFields'] + ";\n"
postString += "Result = FOREACH Result GENERATE * AS (" + fsDic['genSchema'] + ");\n"
# postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n"
# postString += "DESCRIBE Result;\n"
# A1BResult = JOIN BResult BY UserId, CResult By UserId;
pigString += postString
pigString += """
DUMP Result;
DESCRIBE Result;
"""
print(pigString)
# with open('cyygeneratedPig.pig','w') as outFile:
# outFile.write(pigString)
if USE_PIG:
P = Pig.compile(pigString)
# P = Pig.compileFromFile('pig_bcd_bc.pig')
# run the pig script
if True:
result = P.bind().runSingle()
        if result.isSuccessful():
            print 'run success'
        else:
            raise Exception('run failed')
Example 10: range
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
MIN_SCORE = 0
MAX_ITERATION = 100
# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
initial_centroids = initial_centroids + str(last_centroids[i])
if i!=k-1:
initial_centroids = initial_centroids + ":"
P = Pig.compile("""register 'centroid.py' using jython as centroid;
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, centroid.get_closest_centroid(gpa, '$centroids') as centroid;
grouped = group centroided by centroid parallel 2;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'kmoutput';
""")
converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
Q = P.bind({'centroids':initial_centroids})
results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
iter = results.result("result").iterator()
centroids = [None] * k
distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved since the last iteration
Example 11: str
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
# initial_centroids = initial_centroids + str(last_centroids[i])
# if i!=k-1:
# initial_centroids = initial_centroids + ":"
initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]
print initial_centroids
P = Pig.compile("""register /Users/yun_shen/Desktop/spams/pigudf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw_data = load '1.log' as (spam_id:chararray, longitude:double, latitude:double);
raw = filter raw_data by longitude is not null and latitude is not null;
centroided = foreach raw generate spam_id, longitude, latitude, find_centroid(longitude, latitude) as centroid;
grouped = group centroided by centroid parallel 4;
store grouped into 'grouped';
result = foreach grouped generate group, AVG(centroided.longitude), AVG(centroided.latitude);
store result into 'output';
""")
converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
Q = P.bind({'centroids':initial_centroids})
results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
iter = results.result("result").iterator()
centroids = []
x = 0.0
Example 12: count_ngrams
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
from org.apache.pig.scripting import Pig
# if output path does not exist, create it
if Pig.fs('-test -d ' + _out):
Pig.fs('mkdir ' + _out)
##
# CountJob
#
# if the output path of CountJob already exists, skip it; otherwise run the job
##
if not Pig.fs('-test -d ' + _out_nc):
print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
result = Pig.compile(_header + """
count_ngrams( '${in}', '${out}', '${min_count}' );
""").bind({'in':_in, 'out':_out_nc, 'min_count': _min_count, 'n':'count-ngrams'}).runSingle()
# check the result
if not result.isSuccessful():
raise "Pig job failed"
##
# ExtractVocabularyJob
#
# if the output path of this job already exists, skip it; otherwise run it
##
if not Pig.fs('-test -d ' + _out_v):
print '\nPath ("%s") already exists, skipping job.\n' % _out_v
else:
result = Pig.compile(_header + """
Example 13: AS
# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compile [as alias]
P = Pig.compile("""
previous_pagerank =
LOAD '$docs_in'
AS ( url: $inputType, pagerank: float, links:{ link: ( url: $inputType ) } );
/**
Creates
<http://rdf.chemspider.com/3442>, 1.0, {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)}
*/
outbound_pagerank =
FOREACH previous_pagerank
GENERATE
pagerank / COUNT ( links ) AS pagerank,
FLATTEN ( links ) AS to_url;
/**
Creates:
1.0, <http://bla>
1.0, <http://www.w3.org/2004/02/skos/core#exactMatch>
*/
cogrpd = cogroup outbound_pagerank by to_url, previous_pagerank by url;
/**
creates:
<http://rdf.chemspider.com/3442>, {}, {(<http://rdf.chemspider.com/3442>, 1.0, {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)})}
*/
new_pagerank =
FOREACH
cogrpd
GENERATE
group AS url,
( 1 - $d ) + $d * SUM (outbound_pagerank.pagerank) AS pagerank,
FLATTEN ( previous_pagerank.links ) AS links,
FLATTEN ( previous_pagerank.pagerank ) AS previous_pagerank;
STORE new_pagerank
INTO '$docs_out';
nonulls = filter new_pagerank by previous_pagerank is not null and pagerank is not null;
pagerank_diff = FOREACH nonulls GENERATE ABS ( previous_pagerank - pagerank );
grpall = group pagerank_diff all;
max_diff = foreach grpall generate MAX (pagerank_diff);
STORE max_diff INTO '$max_diff';
""")