

Python SparkFiles.get Method Code Examples

This article compiles typical usage examples of the pyspark.SparkFiles.get method in Python. If you are wondering what SparkFiles.get does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples of the containing class, pyspark.SparkFiles.


The following presents 15 code examples of the SparkFiles.get method, sorted by popularity by default.
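Before the individual examples, here is a minimal sketch of the pattern they all rely on: ship a file to every executor with SparkContext.addFile, then resolve its node-local path inside a task with SparkFiles.get. The file path /tmp/lookup.txt and the application name are placeholder assumptions for illustration; the file must already exist on the driver.

from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="sparkfiles-demo")

# Ship a driver-local file to every executor (the path is a placeholder).
sc.addFile("/tmp/lookup.txt")

def first_line(_):
    # On the executor, SparkFiles.get returns the local path of the shipped copy.
    with open(SparkFiles.get("lookup.txt")) as f:
        return f.readline().strip()

# Each task reads its own local copy of the distributed file.
print(sc.parallelize(range(2), 2).map(first_line).collect())
sc.stop()

Note that SparkFiles.get takes only the base file name, not the original driver-side path, and the returned path is valid only on the node where it is called.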

Example 1: crfexec

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def crfexec(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd
    rdd_crf = rdd_pipeinput.pipe(cmd)
    
    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
Developer: cjsanjay, Project: dig-crf, Lines of code: 28, Source file: crfexec.py

Example 2: train_partition

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def train_partition(idx, iterator):
    port = 50000 + idx % 256
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    #  out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
Developer: kjchavez, Project: distributed-deep-q, Lines of code: 33, Source file: ddq.py

Example 3: predict

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
    def predict(self, X):
        """ Assumes X is an RDD or a list of (data, label) minibatch tuples."""

        if isinstance(X, RDD):
            # Distribute files
            X.context.addFile(self._solver_filename)
            X.context.addFile(self._architecture_filename)
            X.mapPartitions(self.predict)

        solver_filename = \
            SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
        architecture_filename = \
            SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

        # Might need to modify path to architecture file inside solver file.
        # Maybe we should do this before shipping the file since all Spark
        # tmp directories will be identically named.

        net = SGDSolver(solver_filename).net

        for minibatch_data, minibatch_label in X:
            # TODO: update function call for latest Caffe
            net.set_input_arrays(minibatch_data,
                                 minibatch_label,
                                 self.input_index)
            output = net.forward(end=self.score_blob)
            scores = output[self.score_blob]
            pred = np.argmax(scores, axis=1).squeeze()
            yield pred
Developer: kjchavez, Project: pyspark-caffe, Lines of code: 31, Source file: sparkcaffe.py

Example 4: ship_prototxt_to_data

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
    def ship_prototxt_to_data(self, rdd):
        rdd.context.addFile(self._solver_filename)
        rdd.context.addFile(self._architecture_filename)
        solver_filename = \
            SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
        architecture_filename = \
            SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

        return solver_filename, architecture_filename
Developer: kjchavez, Project: pyspark-caffe, Lines of code: 11, Source file: sparkcaffe.py

Example 5: compute_buried_area

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
		def compute_buried_area(pdb_complex):
			chZ = "chZ"
			
			sasa_complex = -1.0
			sasa_rec = -1.0
			sasa_lig = -1.0
			buried_total = -1.0

			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")		
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")
			
			f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg")
			f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg")			
			f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg")						
						
			# Makes the index file with the ligand (chain z) and the rest (non chain z)
			script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx	
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()	
					
			command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()

			# Makes f_temp_sasa_rec file 
			script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command
			command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()	

			command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " +  f_temp_sasa_lig
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()

			sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
			sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
			sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

			buried_total = sasa_rec + sasa_lig - sasa_complex

			#Generating result - See column sorting because resultaed file will be created based on this sorting
			returned_list = (base_name, buried_total)

			#Deleting files
			os.remove(f_ndx)
			os.remove(f_temp_sasa_complex)
			os.remove(f_temp_sasa_rec)
			os.remove(f_temp_sasa_lig)			

			return returned_list			
Developer: Tidebringer, Project: drugdesign, Lines of code: 55, Source file: buried_areas.py

Example 6: test_add_file_locally

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
 def test_add_file_locally(self):
     path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
     self.sc.addFile(path)
     download_path = SparkFiles.get("hello.txt")
     self.assertNotEqual(path, download_path)
     with open(download_path) as test_file:
         self.assertEqual("Hello World!\n", test_file.readline())
Developer: JingchengDu, Project: spark, Lines of code: 9, Source file: test_context.py

Example 7: load_timestep

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return valid values
    return (timestep, step[~step.mask])
Developer: OpenGeoscience, Project: nex, Lines of code: 11, Source file: timestep_mean.py

Example 8: spawn_barista

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")
    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
Developer: kjchavez, Project: distributed-deep-q, Lines of code: 22, Source file: ddq.py

Example 9: partitionIp2city

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
           city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    reader = database.Reader(SparkFiles.get(geoDBpath))
    #return [ip2city(ip) for ip in iter]
    return ip2city(iter)
Developer: jellylidong, Project: DeepDefense_dataStatistics, Lines of code: 15, Source file: testGeoSpark.py

Example 10: main

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def main(sc):
  sqlContext = SQLContext(sc)
  df = sqlContext.jsonFile(DATA_PATH)
  #add the filter file
  sc.addFile(FILTER_TERMS_FILE_PATH)
  filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
  global filter_terms_set_bc
  filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))
  # Register the DataFrame as a table.
  df.registerTempTable("tweet")
  results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")
  #filter tweets to find health related tweets
  filter_health_tweets = results.rdd.filter(healthFilter)
  filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
Developer: LeotisBuchanan, Project: stream-data-analysis-realtime, Lines of code: 16, Source file: filterandConvertToCSV.py

Example 11: compute_buried_area_ligand

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
		def compute_buried_area_ligand(pdb_complex):
			chZ = "chZ"
			buried_lig_rec_perc = -1.0
			buried_lig_rec = -1.0
			buried_lig_lig = -1.0
			buried_lig_lig_perc = -1.0
			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)						
			pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")			
			#ndx files					
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")			
			#xvg files
			xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg")
			xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg")
			xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg")
			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+  xvg_temp_sasa_lig_pose + " "+ str(probe.value)  + " "+ str(ndots.value)  + " "+  xvg_temp_sasa_lig_complex  + " "+ pdb_before_vs  + " "+  xvg_temp_sasa_lig_min
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()			
			try:
				# SASA of the isolated ligand in the pose conformation			
				sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
				# SASA of the complexed ligand in the pose conformation
				sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
				# SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
				sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)
				# Area of the ligand which is buried in the receptor
				buried_lig_rec = sasa_lig_pose - sasa_lig_complex
				buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose
				# Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
				buried_lig_lig = sasa_lig_min - sasa_lig_pose
				buried_lig_lig_perc = buried_lig_lig / sasa_lig_min
				returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

				#Deleting files
				os.remove(f_ndx)			
				os.remove(xvg_temp_sasa_lig_pose)
				os.remove(xvg_temp_sasa_lig_complex)
				os.remove(xvg_temp_sasa_lig_min)

				return returned_list
			except:
				return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
Developer: rodrigofaccioli, Project: drugdesign, Lines of code: 48, Source file: buried_area_ligand.py

Example 12: partition_processor

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
        def partition_processor(partitionlinechunks):
            """
            Partition logic for pyspark parallel processing
            """

            model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

            def set_predictions(x):
                segment = model_pipe_object.predict_proba(x)
                return segment

            df_with_nan = build_dataframe(partitionlinechunks)
            df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
            behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
            predictions_ser = set_predictions(behaviour_df)

            predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:,'A'], predictions_ser.loc[:,'Y'], predictions_ser.loc[:,'segment'], predictions_ser.loc[:,'model_version'])]]
            return iter(predictions_list)
Developer: vinodhkrishnaraju, Project: Stacked-Classifier, Lines of code: 20, Source file: apply_model.py

Example 13: load_matrix

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def load_matrix(
        filename,
        sc,
        num_users=NUM_USER,
        num_items=NUM_SONG
):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)

    return counts, num_users * num_items - num_zeros
Developer: ken61502, Project: fastcode-music-recommendation, Lines of code: 42, Source file: svd_imf_spark.py

Example 14: driver

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = None

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))
        
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))
    
    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd
    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors, 
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
#......... some of the code is omitted here .........
Developer: cjsanjay, Project: dig-crf, Lines of code: 103, Source file: prepdriver-old.py

Example 15: driver

# Required import: from pyspark import SparkFiles [as alias]
# Or: from pyspark.SparkFiles import get [as alias]
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = 8

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_final = rdd_sequence_file_input.mapValues(lambda x: json.loads(x)).mapValues(lambda x: extract_body(x)).mapValues(lambda x: textTokens(x)).map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True))).mapValues(lambda x: base64.b64encode(vectorToString(x))).values().pipe(cmd).map(lambda x: restore(x)).mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1]).flatMapValues(lambda x: list(x)).mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor})).mapValues(lambda x: json.dumps(x))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
#         if outputFormat == "sequence":
#             rdd_final.saveAsSequenceFile(outputDirectory)
#         elif outputFormat == "text":
#             rdd_final.saveAsTextFile(outputDirectory)
#         else:
#             raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
Developer: cjsanjay, Project: dig-crf, Lines of code: 68, Source file: driver2.py


Note: The pyspark.SparkFiles.get examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Refer to the corresponding project's license before using or redistributing the code; do not reproduce this article without permission.