

Python scripting.Pig Class Code Examples

This article collects and summarizes typical usage examples of the Pig class from the Python module org.apache.pig.scripting. If you are wondering what the Pig class does, how to use it, or what it looks like in practice, the curated class examples below may help.


The 15 Pig class code examples below are ordered by popularity by default.
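Before the individual examples, a minimal sketch of the compile/bind/run pattern that nearly all of them share may help orient the reader. The script text, paths, and parameter names here are placeholders, not taken from any of the projects below:

from org.apache.pig.scripting import Pig

# Compile a parameterized Pig Latin script; $input and $output are
# substituted at bind time.
P = Pig.compile("""
    data = LOAD '$input' AS (line:chararray);
    STORE data INTO '$output';
""")

# Bind concrete parameter values and run. runSingle() returns a
# PigStats object describing the finished job.
stats = P.bind({'input': 'some_input.txt', 'output': 'some_output'}).runSingle()
if not stats.isSuccessful():
    raise Exception('Pig job failed: %s' % stats.getErrorMessage())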

Example 1: __init__

 def __init__(self, params):
     # BIND and RUN
     self.params = params
     self.set_param_defaults()
     Pig.fs("rmr " + self.params['output_name'])
     generator = PigScriptGenerator.PigScriptGenerator(self.params)
     full_script = generator.generate()
     
     P = Pig.compile( full_script )
     
     results = P.bind({
                           'output':self.params['output_name'],
                           }).runSingle()
     
     if results.isSuccessful():
         print 'Pig job succeeded'
     else:
         # raising a bare string is invalid; raise a real exception
         raise Exception('Pig job failed')
     result_iter = results.result("final_set").iterator()
     
     #This takes care of turning our iter into something we can use.
     self.make_dict_from_results(result_iter)
     
     send_to_grapht = raw_input('do you want to send this data to grapht?')
     if send_to_grapht not in ('y', 'yes', '1'): 
         sys.exit()
     connector = GraphtConnector('grapht.shuttercorp.net')
     metric = self.params['output_name']
     connector.record_data_points(metric, self.result)
     
Developer: EliFinkelshteyn, Project: Lipstick, Lines: 29, Source: PigRunner.py

Example 2: __init__

 def __init__ (self, jars = [], properties = {}):
     ''' Initialize Pig. '''
     for jar in jars:
         logger.debug (" >>> register jar: %s", jar)
         Pig.registerJar (jar)
     for key in properties:
         logger.debug (" >>> set property: %s => %s", key, properties[key])
         Pig.set (key, properties [key]) 
Developer: dacornej, Project: crcsim, Lines: 8, Source: geocode.py

Example 3: run

    def run(self):
        print "%s: %s" % (self.script_name, self.description)
        stats = self.bound_script.runSingle()

        if stats.isSuccessful():
            Pig.fs("touchz %s" % self.flag_file_path)
        else:
            raise Exception("\nScript %s failed! Error should be logged above.\n" % self.script_name +
                            "Once you have fixed the problem, you can restart the workflow at this step " +
                            "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
Developer: mortardata, Project: bacon-bits, Lines: 10, Source: pig_workflow.py

Example 4: runbidi

def runbidi(src, fdest):
	P = Pig.compileFromFile('src/main/pig/bidi.pig')

	cntsbase = 'counts'
	Pig.fs('rmr ' + cntsbase)

	for count in range(10):
		dest = fdest + 'gm%04d' % count
		Pig.fs('rmr ' + dest)
		cnts = cntsbase
		params = {'src':src, 'dest':dest, 'cnts':cnts}
		bound = P.bind(params)
		job = bound.runSingle()

		if not job.isSuccessful():
			raise Exception('failed')

		src = dest

		iter = job.result('S').iterator()
		if iter.hasNext():
			Pig.fs('rmr ' + cnts)
		else:
			Pig.fs('mv ' + dest + ' ' + fdest)
			print 'ALL DONE!'
			break
Developer: cdcttr, Project: similarity-engine, Lines: 26, Source: bidipig.py

Example 5: run

 def run (self, params, script_name, script_file, elements = []):
     ''' Execute pig. '''
     pig = Pig.compileFromFile (script_name, script_file)
     bound = pig.bind (params) 
     futures = bound.run () if isinstance (params, list) else bound.runSingle ()
     self.handle_future (futures, elements)
     self.complete ()
Developer: dacornej, Project: crcsim, Lines: 7, Source: geocode.py
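The isinstance check above exists because bind() also accepts a list of parameter maps, in which case run() launches one job per map and returns a list of PigStats rather than a single one. A minimal sketch of that multi-binding form (script name and parameter values are placeholders):

# Hypothetical multi-binding run: one job per parameter map.
param_list = [{'DATE': '20130801'}, {'DATE': '20130802'}]
bound = Pig.compileFromFile('myscript.pig').bind(param_list)
for stats in bound.run():
    if not stats.isSuccessful():
        raise Exception(stats.getErrorMessage())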

Example 6: run_script

def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
Developer: AsherBond, Project: mortar, Lines: 8, Source: controlscript.py

Example 7: main

def main(argv=None):
#Ideally I want to use arguments, e.g. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
#but that just doesn't work and I'm not sure the fix has been applied in my version; it does work
#with a test .py that only has two lines: import sys, and print sys.argv. The issue is tracked at
#https://issues.apache.org/jira/browse/PIG-2548
#    if argv is None:
#        argv = sys.argv
#    if len(argv) != 3:
#        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
#        return 1
#
#    profile_file = argv[1]
#    timeframe = argv[2]
    
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    #Load the config
    profile = {}
    execfile(profile_file, {'timeframe':timeframe}, profile)

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
Developer: tkuhlman, Project: pigfeed, Lines: 49, Source: web_process.py

Example 8: import_logs

def import_logs(profile):
    """ Import all the log files for a given day and processed them putting each in a log dir.
        If the profile is a list there are multiple files otherwise only a single one.
        The files are combined when running web_load.pig
    """
    #Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    #Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Developer: tkuhlman, Project: pigfeed, Lines: 21, Source: web_process.py

Example 9: run_script

def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")

        bound = P.bind({"ITERATION_NUM":i})

        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
Developer: mortardata, Project: load-grouper-example, Lines: 21, Source: iterate-example.py

Example 10: range

MIN_SCORE = 0
MAX_ITERATION = 5

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean, not a status string
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved from the last iteration
Developer: HortonworksUniversity, Project: Admin_Rev3, Lines: 32, Source: kmeans.py
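The snippet is cut off mid-loop; per its final comment, the remainder reads the new centroids from the result alias and measures how far they moved. A hedged reconstruction of that loop tail, assuming a tolerance threshold variable not shown above:

# Hypothetical continuation of the while-loop body above; 'tolerance'
# is an assumed convergence threshold, not taken from the snippet.
from math import fabs

for i in range(k):
    tuple = iter.next()
    centroids[i] = float(str(tuple.get(1)))
    distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
distance_move = distance_move / k
if distance_move < tolerance:
    converged = True
    break
last_centroids = centroids[:]
# re-encode the centroids as a ':'-separated string for the next bind
initial_centroids = ":".join([str(c) for c in last_centroids])
iter_num += 1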

Example 11: str

    print 'LOG: Elapsed %f' % (endTime - startTime)
    # Remove the guardFile
    fs.delete( guardFile, True )
    System.exit(0)

if fs.exists( parsedDir ):

    # parsed-captures
    if ( not fs.exists( parsedCaptures) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( parsedCaptures ).getModificationTime() ):
        print 'LOG: Graph parsed-captures create'
        fs.delete( parsedCaptures, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(parsedCaptures),
                   'JOBNAME': str(collection) + ' parsed-captures' }
        job = Pig.compileFromFile( 'pig/parsed-captures.pig' ).bind( params )
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if ( not fs.exists( linkGraph ) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( linkGraph ).getModificationTime() ):
        print 'LOG: Graph link-graph create'
        fs.delete( linkGraph, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(linkGraph),
                   'JOBNAME': str(collection) + ' link-graph' }
Developer: aaronbinns, Project: waimea, Lines: 31, Source: build.py

Example 12: int

from math                     import ceil, log
from org.apache.pig.scripting import Pig

if __name__ == "__main__":
    params        = Pig.getParameters()
    graph         = params["GRAPH"]
    seed_vertices = params["SEED_VERTICES"]
    tmp_dir       = params["TMP_DIR"]
    output_path   = params["OUTPUT_PATH"]
    nhood_size    = int(params["NEIGHBORHOOD_SIZE"])

    preprocess_graph        = "%s/preprocess/graph"        % tmp_dir
    preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir
    iteration_verts_prefix  = "%s/iteration/vertices_"     % tmp_dir

    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH"         : graph,
        "GRAPH_OUTPUT_PATH"        : preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations   = nhood_size - 1
    num_vertices     = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i+1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH"  : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)),
Developer: mortardata, Project: bacon-bits, Lines: 31, Source: graph_sampler.py

Example 13: range

MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean, not a status string
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration and calculate how far they moved from the last iteration
Developer: mikebin, Project: hdp-demos, Lines: 32, Source: kmeans.py

Example 14: str

        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/07/" + str(i) 
        params.append(par) 
    
    
    prefix = "2013080"
    for i in range(1, 10):
        day = prefix + str(i)
        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/08/0" + str(i) 
        params.append(par) 
    
    
    Pig.registerUDF("attribute_click.py", "myfuncs")

#     ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');

    script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;

job_views = FOREACH job_view_events GENERATE 
  (int)header.memberId   AS memberId,
  (long)header.time      AS time,
  trackingCode,
  (int)trackingInfo#'0'  AS jobId;
Developer: olittle, Project: cata-offline, Lines: 31, Source: ViewPairs.py

Example 15: run_pagerank

    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig").bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocesing step:
        print "Starting postprocessing step."

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "ID_NAME_MAP_INPUT_PATH" : id_name_map,
                "OUTPUT_PATH"            : output_path
            }).runSingle()
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "OUTPUT_PATH"            : output_path
            }).runSingle()
#......... remainder of this code omitted .........
Developer: mortardata, Project: bacon-bits, Lines: 101, Source: pagerank_lib.py
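IterationUtils.iterate_until_convergence is not included in this excerpt; judging only from the parameter comments at the call site above, a helper of roughly this shape would fit. This is a sketch under those assumptions, not the actual bacon-bits implementation:

from org.apache.pig.scripting import Pig

def iterate_until_convergence(script_path, iteration_dir, param_func,
                              metric_name, metric_type, metric_alias,
                              convergence_threshold, max_num_iterations):
    # Sketch only: reconstructed from the documented call above.
    script = Pig.compileFromFile(script_path)
    num_iterations = 0
    for it_num in range(1, max_num_iterations + 1):
        stats = script.bind(param_func(it_num, iteration_dir)).runSingle()
        if not stats.isSuccessful():
            raise Exception(stats.getErrorMessage())
        # Read the convergence metric from the aliased output.
        metric = metric_type(str(stats.result(metric_alias).iterator().next().get(0)))
        print "%s after iteration %d: %s" % (metric_name, it_num, metric)
        num_iterations = it_num
        if metric < convergence_threshold:
            break
    return {"num_iterations": num_iterations}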


Note: The org.apache.pig.scripting.Pig class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their authors; the source code remains copyrighted by the original authors, and any distribution or use should follow the corresponding project's License. Do not reproduce without permission.