This article collects typical usage examples of the pyspark.SparkFiles class in Python. If you are wondering what the SparkFiles class is for, or how and where to use it, the curated class code examples below should help.
The sections below show 15 code examples of the SparkFiles class, sorted by popularity by default.
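Before the individual examples, here is a minimal sketch of the pattern almost all of them share: the driver ships a file to every executor with SparkContext.addFile, and each task later resolves the worker-local copy by file name with SparkFiles.get. The file path /tmp/lookup.txt and the helper count_lines are placeholders for illustration only and do not appear in the examples below.

from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="sparkfiles-demo")
sc.addFile("/tmp/lookup.txt")  # hypothetical file; copied to every executor

def count_lines(_):
    # On a worker, SparkFiles.get returns the local path of the shipped copy.
    local_path = SparkFiles.get("lookup.txt")
    with open(local_path) as f:
        return sum(1 for line in f)

print(sc.parallelize(range(2), 2).map(count_lines).collect())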
Example 1: train_partition
def train_partition(idx, iterator):
    port = 50000 + idx % 256

    # Resolve files that the driver shipped to this executor via sc.addFile
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    # Wait until the Barista process signals that it is ready
    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
Example 2: crfexec
def crfexec(sc, inputFilename, outputDirectory,
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test executable or the model:
    # that only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)),
                            SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_crf = rdd_pipeinput.pipe(cmd)

    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
Example 3: predict
def predict(self, X):
    """Assumes X is an RDD or a list of (data, label) minibatch tuples."""
    if isinstance(X, RDD):
        # Distribute the solver and architecture files to the executors
        X.context.addFile(self._solver_filename)
        X.context.addFile(self._architecture_filename)
        X.mapPartitions(self.predict)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    # Might need to modify path to architecture file inside solver file.
    # Maybe we should do this before shipping the file since all Spark
    # tmp directories will be identically named.

    net = SGDSolver(solver_filename).net

    for minibatch_data, minibatch_label in X:
        # TODO: update function call for latest Caffe
        net.set_input_arrays(minibatch_data,
                             minibatch_label,
                             self.input_index)
        output = net.forward(end=self.score_blob)
        scores = output[self.score_blob]
        pred = np.argmax(scores, axis=1).squeeze()
        yield pred
Example 4: ship_prototxt_to_data
def ship_prototxt_to_data(self, rdd):
    # Ship the solver and architecture prototxt files to the executors,
    # then resolve their local paths with SparkFiles.get.
    rdd.context.addFile(self._solver_filename)
    rdd.context.addFile(self._architecture_filename)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    return solver_filename, architecture_filename
Example 5: compute_buried_area
def compute_buried_area(pdb_complex):
    chZ = "chZ"
    sasa_complex = -1.0
    sasa_rec = -1.0
    sasa_lig = -1.0
    buried_total = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_complex.xvg")
    f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_rec.xvg")
    f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig.xvg")

    # Makes the index file with the ligand (chain z) and the rest (non chain z)
    script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh")  # Bash script that was copied by the addFile command
    command = script_make_ndx + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # SASA of the whole complex
    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System " + " -xvg none " + " -o " + f_temp_sasa_complex
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # Makes the f_temp_sasa_rec file
    script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh")  # Bash script that was copied by the addFile command
    command = script_make_sasa_rec + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + f_temp_sasa_rec
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # SASA of the ligand (chain z)
    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ " + " -xvg none " + " -o " + f_temp_sasa_lig
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
    sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
    sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

    buried_total = sasa_rec + sasa_lig - sasa_complex

    # Generating result - see column sorting because the resulting file will be created based on this sorting
    returned_list = (base_name, buried_total)

    # Deleting temporary files
    os.remove(f_ndx)
    os.remove(f_temp_sasa_complex)
    os.remove(f_temp_sasa_rec)
    os.remove(f_temp_sasa_lig)

    return returned_list
Example 6: test_add_file_locally
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    self.sc.addFile(path)

    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
Example 7: load_timestep
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        # Use the local copy of the NetCDF file distributed via addFile
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return only the valid (unmasked) values
    return (timestep, step[~step.mask])
Example 8: spawn_barista
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    # Block until the Barista process writes its ready flag
    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
Example 9: partitionIp2city
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
            city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    # geoDBpath is expected to hold the name of the GeoIP database distributed via sc.addFile
    reader = database.Reader(SparkFiles.get(geoDBpath))
    # return [ip2city(ip) for ip in iter]
    return ip2city(iter)
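For context, driver-side wiring for the function above might look like the sketch below; the database location, the sample IPs, and the use of map (matching the uncommented single-IP return) are assumptions for illustration:

geoDBpath = 'GeoLite2-City.mmdb'         # name later resolved by SparkFiles.get on the workers
sc.addFile('/data/geoip/' + geoDBpath)   # assumed local path of the GeoIP database

ips = sc.parallelize(['128.101.101.101', '8.8.8.8'])
print(ips.map(partitionIp2city).collect())  # one city name (or 'not found') per IP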
Example 10: main
def main(sc):
    sqlContext = SQLContext(sc)
    df = sqlContext.jsonFile(DATA_PATH)

    # Add the filter file
    sc.addFile(FILTER_TERMS_FILE_PATH)
    filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
    global filter_terms_set_bc
    filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))

    # Register the DataFrame as a table.
    df.registerTempTable("tweet")
    results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")

    # Filter tweets to find health-related tweets
    filter_health_tweets = results.rdd.filter(healthFilter)
    filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
Example 11: compute_buried_area_ligand
def compute_buried_area_ligand(pdb_complex):
    chZ = "chZ"
    buried_lig_rec_perc = -1.0
    buried_lig_rec = -1.0
    buried_lig_lig = -1.0
    buried_lig_lig_perc = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)
    pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    # ndx files
    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")

    # xvg files
    xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_pose" + ".xvg")
    xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_complex" + ".xvg")
    xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_min" + ".xvg")

    # Creates a selection with the residues that are closer than 6A to the ligand
    script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh")  # Bash script that was copied by the addFile command
    command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + xvg_temp_sasa_lig_pose + " " + str(probe.value) + " " + str(ndots.value) + " " + xvg_temp_sasa_lig_complex + " " + pdb_before_vs + " " + xvg_temp_sasa_lig_min
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    try:
        # SASA of the isolated ligand in the pose conformation
        sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
        # SASA of the complexed ligand in the pose conformation
        sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
        # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
        sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)

        # Area of the ligand which is buried in the receptor
        buried_lig_rec = sasa_lig_pose - sasa_lig_complex
        buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose

        # Area of the ligand in the pose conformation which is buried in itself
        # when compared to the energy-minimized conformation
        buried_lig_lig = sasa_lig_min - sasa_lig_pose
        buried_lig_lig_perc = buried_lig_lig / sasa_lig_min

        returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

        # Deleting temporary files
        os.remove(f_ndx)
        os.remove(xvg_temp_sasa_lig_pose)
        os.remove(xvg_temp_sasa_lig_complex)
        os.remove(xvg_temp_sasa_lig_min)

        return returned_list
    except:
        return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
Example 12: partition_processor
def partition_processor(partitionlinechunks):
    """
    Partition logic for pyspark parallel processing.
    """
    # Load the pickled model from the file distributed via sc.addFile
    model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

    def set_predictions(x):
        segment = model_pipe_object.predict_proba(x)
        return segment

    df_with_nan = build_dataframe(partitionlinechunks)
    df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
    behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
    predictions_ser = set_predictions(behaviour_df)

    predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:, 'A'], predictions_ser.loc[:, 'Y'], predictions_ser.loc[:, 'segment'], predictions_ser.loc[:, 'model_version'])]]
    return iter(predictions_list)
Example 13: load_matrix
def load_matrix(
        filename,
        sc,
        num_users=NUM_USER,
        num_items=NUM_SONG
):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    # Download the file to every node and read the local copy
    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)
    return counts, num_users * num_items - num_zeros
Example 14: _getCountryByIP
def _getCountryByIP(ip):
    # The GeoLite2 database was distributed to the workers via sc.addFile
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()
Example 15: hasDistInfo
distScript = os.getcwd() + "/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)

def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance."""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))

def formatCall(call):
    """Format a call so that it can be parsed by our R program."""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"],
        call["contactlat"], call["contactlong"])

pipeInputs = contactsContactList.values().flatMap(
    lambda calls: map(formatCall, filter(hasDistInfo, calls)))
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print distances.collect()

# Convert our RDD of strings to numeric data so we can compute stats and
# remove the outliers.
distanceNumerics = distances.map(lambda string: float(string))
stats = distanceNumerics.stats()
stddev = stats.stdev()
mean = stats.mean()
reasonableDistances = distanceNumerics.filter(
    lambda x: math.fabs(x - mean) < 3 * stddev)
print reasonableDistances.collect()