

Python Pig.compileFromFile Method Code Examples

This article collects typical usage examples of the Python method org.apache.pig.scripting.Pig.compileFromFile. If you are wondering what Pig.compileFromFile does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of its containing class, org.apache.pig.scripting.Pig.


Below are 15 code examples of the Pig.compileFromFile method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
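
Before diving into the examples, here is a minimal sketch of the compile-bind-run pattern that all of the snippets below follow. The script path and parameter names here are hypothetical:

from org.apache.pig.scripting import Pig

# Compile once; bind parameters and run as many times as needed.
P = Pig.compileFromFile('/tmp/wordcount.pig')  # hypothetical script taking $input and $output
bound = P.bind({'input': 'books.txt', 'output': 'counts'})
stats = bound.runSingle()  # returns a PigStats object
if not stats.isSuccessful():
    raise Exception('Pig job failed: %s' % stats.getErrorMessage())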

Example 1: runbidi

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def runbidi(src, fdest):
	P = Pig.compileFromFile('src/main/pig/bidi.pig')

	cntsbase = 'counts'
	Pig.fs('rmr ' + cntsbase)

	for count in range(10):
		dest = fdest + 'gm%04d' % count
		Pig.fs('rmr ' + dest)
		cnts = cntsbase
		params = {'src':src, 'dest':dest, 'cnts':cnts}
		bound = P.bind(params)
		job = bound.runSingle()

		if not job.isSuccessful():
			raise Exception('failed')

		src = dest

		it = job.result('S').iterator()
		if it.hasNext():
			Pig.fs('rmr ' + cnts)
		else:
			Pig.fs('mv ' + dest + ' ' + fdest)
			print 'ALL DONE!'
			break
Author: cdcttr, Project: similarity-engine, Lines: 28, Source: bidipig.py

Example 2: run

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
 def run(self, params, script_name, script_file, elements=[]):
     ''' Compile and execute a Pig script with the given parameter bindings. '''
     pig = Pig.compileFromFile(script_name, script_file)
     bound = pig.bind(params)
     # A list of parameter maps triggers one run per map; a single map runs once.
     futures = bound.run() if isinstance(params, list) else bound.runSingle()
     self.handle_future(futures, elements)
     self.complete()
Author: dacornej, Project: crcsim, Lines: 9, Source: geocode.py
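
When params is a list of dictionaries, bind prepares one invocation per dictionary and run() returns one PigStats per run, which is what the isinstance check above distinguishes. A hedged sketch of the multi-run shape (the script path and parameter values are hypothetical):

from org.apache.pig.scripting import Pig

# One run per parameter dictionary; run() returns a list of PigStats.
script = Pig.compileFromFile('daily_report.pig')  # hypothetical
bound = script.bind([{'DATE': '2013-01-01'}, {'DATE': '2013-01-02'}])
for stats in bound.run():
    if not stats.isSuccessful():
        raise Exception('run failed: %s' % stats.getErrorMessage())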

Example 3: run_script

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
Author: AsherBond, Project: mortar, Lines: 10, Source: controlscript.py

Example 4: main

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def main(argv=None):
#Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily'.
#However it just doesn't work; I'm not sure why, since the fix seems to have been applied in my version. I can
#only get it to work with a test .py that has just two lines: import sys, and print sys.argv. Here is the issue:
#https://issues.apache.org/jira/browse/PIG-2548
#    if argv is None:
#        argv = sys.argv
#    if len(argv) != 3:
#        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
#        return 1
#
#    profile_file = argv[1]
#    timeframe = argv[2]
    
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if not (timeframe == 'daily' or timeframe == 'weekly' or timeframe == 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    #Load the config
    profile = {}
    execfile(profile_file, {'timeframe':timeframe}, profile)

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
Author: tkuhlman, Project: pigfeed, Lines: 51, Source: web_process.py
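
Example 4 (and example 5 below) repeats the same success check for both return shapes of run(). A small helper could factor it out; this is our sketch, not part of the project or the Pig API (the name check_pig_stats is hypothetical):

import sys
import org.apache.pig.tools.pigstats

def check_pig_stats(stats, label):
    # run() may return a single PigStats or a list of them;
    # normalize to a list and exit on the first failed run.
    if isinstance(stats, org.apache.pig.tools.pigstats.PigStats):
        stats = [stats]
    for run in stats:
        if not run.isSuccessful():
            print 'Error in %s, %s' % (label, run.getErrorMessage())
            sys.exit(1)

With this, the duplicated branches reduce to calls like check_pig_stats(load, 'web log load').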

Example 5: import_logs

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def import_logs(profile):
    """ Import all the log files for a given day and processed them putting each in a log dir.
        If the profile is a list there are multiple files otherwise only a single one.
        The files are combined when running web_load.pig
    """
    #Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    #Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Author: tkuhlman, Project: pigfeed, Lines: 23, Source: web_process.py

Example 6: run_script

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")

        bound = P.bind({"ITERATION_NUM":i})

        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

        # r holds the last tuple printed above; field 1 is the average count.
        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
Author: mortardata, Project: load-grouper-example, Lines: 23, Source: iterate-example.py

Example 7: run

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
 def run(self):
     print project_name + ": " + self.action
     compiled = Pig.compileFromFile(self.script)
     bound = compiled.bind(self.params)
     return bound.runSingle()
Author: ChappedSky, Project: gitrec, Lines: 7, Source: github_recommender.py

Example 8: run_pagerank

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
    def run_pagerank(self):
        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
        """
        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(self.preprocessing_script)
        preprocess_params = {
            "INPUT_PATH": self.edges_input,
            "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
            "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
        }
        preprocess_bound = preprocess.bind(preprocess_params)
        preprocess_stats = preprocess_bound.runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        iteration = Pig.compileFromFile(self.iteration_script)
        for i in range(self.max_num_iterations):
            print "Starting iteration step: %s" % str(i + 1)

            # Append the iteration number to the input/output stems
            iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i-1))
            iteration_pageranks_output = self.iteration_pageranks_prefix + str(i)
            iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i)

            iteration_bound = iteration.bind({
                "INPUT_PATH": iteration_input,
                "DAMPING_FACTOR": self.damping_factor,
                "NUM_NODES": num_nodes,
                "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
                "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
            })
            iteration_stats = iteration_bound.runSingle()

            # If we're below the convergence threshold break out of the loop.
            aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0)))
            if aggregate_rank_change < convergence_threshold:
                print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                       % (aggregate_rank_change, convergence_threshold)
                break
            elif i == self.max_num_iterations-1:
                print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                      ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                       "Stopping."
            else:
                print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                       % (aggregate_rank_change, convergence_threshold)

        iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

        # Postprocessing step:
        print "Starting postprocessing step."
        postprocess = Pig.compileFromFile(self.postprocessing_script)
        postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result }
        if self.output_path is not None: # otherwise, the script outputs to the default location,
                                         # which is a special directory in s3://mortar-example-output-data
                                         # permissioned for your Mortar account.
            postprocess_params["OUTPUT_PATH"] = self.output_path
        postprocess_bound = postprocess.bind(postprocess_params)
        postprocess_stats = postprocess_bound.runSingle()
Author: arunpn, Project: mortar-pagerank, Lines: 75, Source: pagerank_lib.py
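
A recurring micro-pattern in these examples is pulling a single scalar out of a run's stored alias: long(str(stats.result(alias).iterator().next().get(0))). A small hedged helper makes the intent clearer (the name scalar_result is ours, not part of the Pig API):

def scalar_result(stats, alias, cast=long):
    # Read field 0 of the first tuple stored to `alias` and cast it.
    # Assumes the script stored exactly one row to that alias.
    return cast(str(stats.result(alias).iterator().next().get(0)))

# e.g.: num_nodes = scalar_result(preprocess_stats, 'num_nodes')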

Example 9: str

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')
 
print '*** Loading input ***' 
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_iterations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({'EDGES_IN': edges_in, 'EDGES_OUT': edges_out, 
        'CONVERGENCE_OUT': 'convergence.tmp'})
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))
    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
Author: AndreyLogvinov, Project: ap-pig, Lines: 33, Source: workflow.py

Example 10: runbidi

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
#!/usr/bin/python
import sys
from org.apache.pig.scripting import Pig
from bidipig import runbidi

# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')

osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80

# With no arguments, bind() resolves the script's $parameters from the
# local variables defined above (src, dest, etc.).
bound = minhash.bind()

job = bound.runSingle()

if not job.isSuccessful():
	raise Exception('failed in minhash')
# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')

src = osrc
keys = dest
Author: cdcttr, Project: similarity-engine, Lines: 33, Source: lsh.pig.py

Example 11: str

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
    print 'LOG: Elapsed %f' % (endTime - startTime)
    # Remove the guardFile
    fs.delete( guardFile, True )
    System.exit(0)

if fs.exists( parsedDir ):

    # parsed-captures
    if ( not fs.exists( parsedCaptures) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( parsedCaptures ).getModificationTime() ):
        print 'LOG: Graph parsed-captures create'
        fs.delete( parsedCaptures, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(parsedCaptures),
                   'JOBNAME': str(collection) + ' parsed-captures' }
        job = Pig.compileFromFile( 'pig/parsed-captures.pig' ).bind( params )
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if ( not fs.exists( linkGraph ) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( linkGraph ).getModificationTime() ):
        print 'LOG: Graph link-graph create'
        fs.delete( linkGraph, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(linkGraph),
                   'JOBNAME': str(collection) + ' link-graph' }
Author: aaronbinns, Project: waimea, Lines: 33, Source: build.py

Example 12: run_pagerank

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig").bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocessing step:
        print "Starting postprocessing step."

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "ID_NAME_MAP_INPUT_PATH" : id_name_map,
                "OUTPUT_PATH"            : output_path
            }).runSingle()
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "OUTPUT_PATH"            : output_path
            }).runSingle()
#.........remainder of the code omitted.........
Author: mortardata, Project: bacon-bits, Lines: 103, Source: pagerank_lib.py
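
IterationUtils.iterate_until_convergence is a helper internal to this project, and its implementation is not shown here. A plausible shape, inferred only from the call-site comments above (this is our reconstruction, not the project's actual code):

from org.apache.pig.scripting import Pig

def iterate_until_convergence(script_path, iteration_dir, param_func,
                              metric_name, metric_type, metric_alias,
                              convergence_threshold, max_num_iterations):
    # Inferred sketch: rerun the script, reading the convergence metric
    # from `metric_alias` after each run, until it drops below the
    # threshold or the iteration cap is hit.
    iteration = Pig.compileFromFile(script_path)
    for it_num in range(1, max_num_iterations + 1):
        stats = iteration.bind(param_func(it_num, iteration_dir)).runSingle()
        metric = metric_type(str(stats.result(metric_alias).iterator().next().get(0)))
        print '%s at iteration %d: %s' % (metric_name, it_num, metric)
        if metric < convergence_threshold:
            break
    return {'num_iterations': it_num}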

Example 13:

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
#! /usr/bin/python
# Pass a Pig script to Python and run it

from org.apache.pig.scripting import Pig

P = Pig.compileFromFile("""myscript.pig""")

input = "original"
output = "output"

result = P.bind({'in': input, 'out': output}).runSingle()
if result.isSuccessful():
	print "Pig job succeeded"
else:
	raise Exception("Pig job failed")
Author: ianlin, Project: small-data, Lines: 17, Source: execute_on_python.py

Example 14: run_script

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # Specify where the data will come from,
    # and where output data will go after each step

    data_stem = "s3n://jpacker-dev/amazon_products/books_graph/"
    num_vertices_input = data_stem + "num_vertices"
    nodes_input = data_stem + "nodes"
    edges_input = data_stem + "edges"

    output_stem = data_stem + "clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"

    """
    data_stem = "../fake-fixtures/"
    num_vertices_input = data_stem + "cathedral-num-vertices"
    nodes_input = data_stem + "cathedral-nodes"
    edges_input = data_stem + "cathedral-edges"

    output_stem = data_stem + "cathedral_clustering/"
    preprocess_num_vertices_output = output_stem + "preprocess/num_vertices"
    preprocess_trans_mat_output = output_stem + "preprocess/trans_mat"
    iteration_trans_mat_output_stem = output_stem + "iteration/trans_mat_"
    postprocess_clusters_output = output_stem + "postprocess/clusters"
    postprocess_stats_output = output_stem + "postprocess/stats"
    """

    # Preprocessing step:
    #
    # (1) Generate a transition matrix from the internal edges
    # (2) Copy precomputed count of # vertices
    #     No computation is being done here; this just lets us use Pig to access the data
    #     instead of configuring S3 access manually with boto
    #
    preprocess = Pig.compileFromFile("../pigscripts/clustering_preprocess.pig")
    preprocess_bound = preprocess.bind({ 
        "NUM_VERTICES_INPUT_PATH": num_vertices_input, 
        "EDGES_INPUT_PATH": edges_input, 
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "TRANS_MAT_OUTPUT_PATH": preprocess_trans_mat_output
    })
    preprocess_stats = preprocess_bound.runSingle()

    # Extract the number of vertices, which we will pass into each iteration as a parameter
    num_vertices = long(str(preprocess_stats.result("num_verts").iterator().next().get(0)))
    
    # Extract the number of edges (including inserted self-loops)
    # We will use this in our convergence metric
    initial_num_edges = long(str(preprocess_stats.getNumberRecords(preprocess_trans_mat_output)))
    
    # Iteration step applying the Markov Clustering operations:
    #
    # (1) Expansion: square the transition matrix ~= take a step in a random walk
    # (2) Inflation: take an elementwise power of the matrix ~= strengthen strong connections, weaken weak ones
    # (3) Pruning: set small matrix values to zero (since the matrix impl is sparse, this greatly speeds things up)
    # (4) Normalization: renormalize the matrix columnwise to keep it a valid transition matrix
    #
    # I tested several mathematically sensible convergence metrics 
    # (max of max residual for each col, avg of max residual for each col, col kurtosis)
    # but none worked very well. So I'm currently just breaking when the number of edges
    # in an iteration's transition matrix is less than the number of edges in 
    # the initial transition matrix times a constant multiple, which seems to indicate
    # that things are settling down.
    #
    # The algorithm has two parameters:
    # (1) The inflation parameter is an exponential factor which determines the cluster size. higher inflation => smaller clusters
    # (2) Epsilon is a minimum threshold for values in the transition matrix; anything smaller will be pruned (set to zero)
    #     I am not sure how high epsilon can safely be set without significantly degrading the quality of the algorithm
    #     If you run in to performance problems though, raising epsilon will dramatically reduce execution time
    #
    iteration = Pig.compileFromFile("../pigscripts/clustering_iterate.pig")
    max_num_iterations = 7  # most graphs should converge after 4-10 iterations
    num_iterations = 0

    for i in range(1, max_num_iterations + 1):
        iteration_input = preprocess_trans_mat_output if i == 1 else (iteration_trans_mat_output_stem + str(i-1))
        iteration_output = iteration_trans_mat_output_stem + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "NUM_VERTICES": num_vertices, 
            "INFLATION_PARAMETER": 1.5,
            "EPSILON": 0.01
        })
        iteration_stats = iteration_bound.runSingle()

        num_iterations += 1
        num_edges = long(str(iteration_stats.getNumberRecords(iteration_output)))
        if num_iterations >= 3 and num_edges < (initial_num_edges * 1.05):
            break

    # Postprocessing step:
    #
#.........remainder of the code omitted.........
Author: andresdouglas, Project: amazon-product-graph, Lines: 103, Source: clustering.py
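
The four Markov Clustering operations listed in the comments of example 14 are easy to state in plain Python. This is an illustrative dense-matrix sketch only; the real work happens at scale in clustering_iterate.pig:

def mcl_iteration(m, inflation=1.5, epsilon=0.01):
    # m is a column-stochastic square matrix given as a list of rows.
    n = len(m)
    # (1) Expansion: square the matrix (~ one step of a random walk).
    sq = [[sum(m[i][k] * m[k][j] for k in range(n)) for j in range(n)]
          for i in range(n)]
    # (2) Inflation and (3) pruning: elementwise power, zero small entries.
    inf = [[(v ** inflation if v ** inflation >= epsilon else 0.0)
            for v in row] for row in sq]
    # (4) Normalization: rescale each column so it sums to 1 again.
    col_sums = [sum(inf[i][j] for i in range(n)) or 1.0 for j in range(n)]
    return [[inf[i][j] / col_sums[j] for j in range(n)] for i in range(n)]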

Example 15: run_pagerank

# Required import: from org.apache.pig.scripting import Pig [as alias]
# Or: from org.apache.pig.scripting.Pig import compileFromFile [as alias]
def run_pagerank():
    """
    Calculates pageranks for Twitter users.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of users.
             b) Prepare initial pagerank values for all users.
        2. Iteration: Calculate new pageranks for each user based on the previous pageranks of the
                      users' followers.
        3. Postprocessing: Find the top pagerank users and join to a separate dataset to find their names.
    """
    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT)
    preprocess_bound = preprocess.bind({
        "INPUT_PATH": FOLLOWER_GRAPH_INPUT,
        "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS,
        "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0)))
    convergence_threshold = CONVERGENCE_THRESHOLD / num_users


    # Iteration step:
    iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT)
    for i in range(MAX_NUM_ITERATIONS):
        print "Starting iteration step: %s" % str(i + 1)

        # Append the iteration number to the input/output stems
        iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i-1))
        iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i)
        iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": DAMPING_FACTOR,
            "NUM_USERS": num_users,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output
        })
        iteration_stats = iteration_bound.runSingle()

        # If we're below the convergence_threshold break out of the loop.
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            print "Max diff %s under convergence threshold. Stopping." % max_diff
            break
        elif i == MAX_NUM_ITERATIONS-1:
            print "Max diff %s above convergence threshold but hit max number of iterations.  Stopping." \
                    % max_diff
        else:
            print "Max diff %s above convergence threshold. Continuing." % max_diff

    iteration_pagerank_result = ITERATION_PAGERANKS_PREFIX + str(i)

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT)
    postprocess_bound = postprocess.bind({
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result,
        "USERNAMES_INPUT_PATH": USERNAMES_INPUT,
        "TOP_N": NUM_TOP_USERS,
        "OUTPUT_BUCKET": OUTPUT_BUCKET
    })
    postprocess_stats = postprocess_bound.runSingle()
Author: ManjushaBolishetty, Project: DocGraph-Page-Rank, Lines: 69, Source: pagerank_final.py


Note: The org.apache.pig.scripting.Pig.compileFromFile examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. See each project's License before redistributing or using the code; do not reproduce without permission.