This article collects typical usage examples of the pyspark.SparkFiles class in Python. If you are wondering what the SparkFiles class is for, or how and where to use it, the curated class code examples below should help.
The sections below show 15 code examples of the SparkFiles class, sorted by popularity by default.
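Before the individual examples, here is a minimal sketch of the pattern almost all of them share: the driver ships a file to every executor with SparkContext.addFile, and each task later resolves the worker-local copy by file name with SparkFiles.get. The file path /tmp/lookup.txt and the helper count_lines are placeholders for illustration only and do not appear in the examples below.

from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="sparkfiles-demo")
sc.addFile("/tmp/lookup.txt")  # hypothetical file; copied to every executor

def count_lines(_):
    # On a worker, SparkFiles.get returns the local path of the shipped copy.
    local_path = SparkFiles.get("lookup.txt")
    with open(local_path) as f:
        return sum(1 for line in f)

print(sc.parallelize(range(2), 2).map(count_lines).collect())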
Example 1: train_partition
def train_partition(idx, iterator):
    port = 50000 + idx % 256

    # Resolve files that the driver shipped to this executor via sc.addFile
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    # Wait until the Barista process signals that it is ready
    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
Example 2: crfexec
def crfexec(sc, inputFilename, outputDirectory,
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test executable or the model:
    # that only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)),
                            SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_crf = rdd_pipeinput.pipe(cmd)

    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
Example 3: predict
def predict(self, X):
    """Assumes X is an RDD or a list of (data, label) minibatch tuples."""
    if isinstance(X, RDD):
        # Distribute the solver and architecture files to the executors
        X.context.addFile(self._solver_filename)
        X.context.addFile(self._architecture_filename)
        X.mapPartitions(self.predict)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    # Might need to modify path to architecture file inside solver file.
    # Maybe we should do this before shipping the file since all Spark
    # tmp directories will be identically named.

    net = SGDSolver(solver_filename).net

    for minibatch_data, minibatch_label in X:
        # TODO: update function call for latest Caffe
        net.set_input_arrays(minibatch_data,
                             minibatch_label,
                             self.input_index)
        output = net.forward(end=self.score_blob)
        scores = output[self.score_blob]
        pred = np.argmax(scores, axis=1).squeeze()
        yield pred
Example 4: ship_prototxt_to_data
def ship_prototxt_to_data(self, rdd):
    # Ship the solver and architecture prototxt files to the executors,
    # then resolve their local paths with SparkFiles.get.
    rdd.context.addFile(self._solver_filename)
    rdd.context.addFile(self._architecture_filename)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    return solver_filename, architecture_filename
Example 5: compute_buried_area
def compute_buried_area(pdb_complex):
    chZ = "chZ"
    sasa_complex = -1.0
    sasa_rec = -1.0
    sasa_lig = -1.0
    buried_total = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_complex.xvg")
    f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_rec.xvg")
    f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig.xvg")

    # Makes the index file with the ligand (chain z) and the rest (non chain z)
    script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh")  # Bash script that was copied by the addFile command
    command = script_make_ndx + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # SASA of the whole complex
    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System " + " -xvg none " + " -o " + f_temp_sasa_complex
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # Makes the f_temp_sasa_rec file
    script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh")  # Bash script that was copied by the addFile command
    command = script_make_sasa_rec + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + f_temp_sasa_rec
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # SASA of the ligand (chain z)
    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ " + " -xvg none " + " -o " + f_temp_sasa_lig
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
    sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
    sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

    buried_total = sasa_rec + sasa_lig - sasa_complex

    # Generating result - see column sorting because the resulting file will be created based on this sorting
    returned_list = (base_name, buried_total)

    # Deleting temporary files
    os.remove(f_ndx)
    os.remove(f_temp_sasa_complex)
    os.remove(f_temp_sasa_rec)
    os.remove(f_temp_sasa_lig)

    return returned_list
Example 6: test_add_file_locally
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    self.sc.addFile(path)

    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
Example 7: load_timestep
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        # Use the local copy of the NetCDF file distributed via addFile
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return only the valid (unmasked) values
    return (timestep, step[~step.mask])
Example 8: spawn_barista
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    # Block until the Barista process writes its ready flag
    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
Example 9: partitionIp2city
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
            city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    # geoDBpath is expected to hold the name of the GeoIP database distributed via sc.addFile
    reader = database.Reader(SparkFiles.get(geoDBpath))
    # return [ip2city(ip) for ip in iter]
    return ip2city(iter)
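For context, driver-side wiring for the function above might look like the sketch below; the database location, the sample IPs, and the use of map (matching the uncommented single-IP return) are assumptions for illustration:

geoDBpath = 'GeoLite2-City.mmdb'         # name later resolved by SparkFiles.get on the workers
sc.addFile('/data/geoip/' + geoDBpath)   # assumed local path of the GeoIP database

ips = sc.parallelize(['128.101.101.101', '8.8.8.8'])
print(ips.map(partitionIp2city).collect())  # one city name (or 'not found') per IP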
Example 10: main
def main(sc):
    sqlContext = SQLContext(sc)
    df = sqlContext.jsonFile(DATA_PATH)

    # Add the filter file
    sc.addFile(FILTER_TERMS_FILE_PATH)
    filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
    global filter_terms_set_bc
    filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))

    # Register the DataFrame as a table.
    df.registerTempTable("tweet")
    results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")

    # Filter tweets to find health-related tweets
    filter_health_tweets = results.rdd.filter(healthFilter)
    filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
Example 11: compute_buried_area_ligand
def compute_buried_area_ligand(pdb_complex):
    chZ = "chZ"
    buried_lig_rec_perc = -1.0
    buried_lig_rec = -1.0
    buried_lig_lig = -1.0
    buried_lig_lig_perc = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)
    pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    # ndx files
    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")

    # xvg files
    xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_pose" + ".xvg")
    xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_complex" + ".xvg")
    xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_min" + ".xvg")

    # Creates a selection with the residues that are closer than 6A to the ligand
    script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh")  # Bash script that was copied by the addFile command
    command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + xvg_temp_sasa_lig_pose + " " + str(probe.value) + " " + str(ndots.value) + " " + xvg_temp_sasa_lig_complex + " " + pdb_before_vs + " " + xvg_temp_sasa_lig_min
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    try:
        # SASA of the isolated ligand in the pose conformation
        sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
        # SASA of the complexed ligand in the pose conformation
        sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
        # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
        sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)

        # Area of the ligand which is buried in the receptor
        buried_lig_rec = sasa_lig_pose - sasa_lig_complex
        buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose

        # Area of the ligand in the pose conformation which is buried in itself
        # when compared to the energy-minimized conformation
        buried_lig_lig = sasa_lig_min - sasa_lig_pose
        buried_lig_lig_perc = buried_lig_lig / sasa_lig_min

        returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

        # Deleting temporary files
        os.remove(f_ndx)
        os.remove(xvg_temp_sasa_lig_pose)
        os.remove(xvg_temp_sasa_lig_complex)
        os.remove(xvg_temp_sasa_lig_min)

        return returned_list
    except:
        return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
Example 12: partition_processor
def partition_processor(partitionlinechunks):
    """
    Partition logic for pyspark parallel processing.
    """
    # Load the pickled model from the file distributed via sc.addFile
    model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

    def set_predictions(x):
        segment = model_pipe_object.predict_proba(x)
        return segment

    df_with_nan = build_dataframe(partitionlinechunks)
    df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
    behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
    predictions_ser = set_predictions(behaviour_df)

    predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:, 'A'], predictions_ser.loc[:, 'Y'], predictions_ser.loc[:, 'segment'], predictions_ser.loc[:, 'model_version'])]]
    return iter(predictions_list)
Example 13: load_matrix
def load_matrix(
        filename,
        sc,
        num_users=NUM_USER,
        num_items=NUM_SONG
):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    # Download the file to every node and read the local copy
    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)
    return counts, num_users * num_items - num_zeros
Example 14: _getCountryByIP
def _getCountryByIP(ip):
    # The GeoLite2 database was distributed to the workers via sc.addFile
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()
Example 15: hasDistInfo
distScript = os.getcwd() + "/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)

def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance."""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))

def formatCall(call):
    """Format a call so that it can be parsed by our R program."""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"],
        call["contactlat"], call["contactlong"])

pipeInputs = contactsContactList.values().flatMap(
    lambda calls: map(formatCall, filter(hasDistInfo, calls)))
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print distances.collect()

# Convert our RDD of strings to numeric data so we can compute stats and
# remove the outliers.
distanceNumerics = distances.map(lambda string: float(string))
stats = distanceNumerics.stats()
stddev = stats.stdev()
mean = stats.mean()
reasonableDistances = distanceNumerics.filter(
    lambda x: math.fabs(x - mean) < 3 * stddev)
print reasonableDistances.collect()