This article collects typical usage examples of the pyspark.SparkFiles.get method in Python. If you have been wondering how exactly to use Python's SparkFiles.get, how it works, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the containing class, pyspark.SparkFiles.
The following shows 15 code examples of the SparkFiles.get method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
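Before the individual examples, here is a minimal, self-contained sketch of the pattern they all rely on: the driver ships a file to every executor with SparkContext.addFile, and worker-side code resolves the local copy with SparkFiles.get using the file's base name. The file path and name below are hypothetical.

# Minimal sketch of the addFile / SparkFiles.get pattern (hypothetical file).
from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="sparkfiles-demo")
sc.addFile("/tmp/lookup.txt")  # driver side: ship the file to every executor

def tag_with_first_line(record):
    # worker side: resolve the local copy by its base name
    local_path = SparkFiles.get("lookup.txt")
    with open(local_path) as f:
        return (record, f.readline().strip())

print(sc.parallelize([1, 2, 3]).map(tag_with_first_line).collect())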
Example 1: crfexec
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def crfexec(sc, inputFilename, outputDirectory,
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_crf = rdd_pipeinput.pipe(cmd)

    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
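The comment inside crfexec stresses that the 'local' branch only works if the executable and model were shipped to the workers beforehand. A hedged sketch of that driver-side setup follows; both paths are assumptions based on the constants inside crfexec, not taken from the original job script.

# Hypothetical driver-side setup for the 'local' branch of crfexec above.
sc.addFile("/usr/local/bin/crf_test")                # CRF++ test binary
sc.addFile("data/config/dig-hair-eye-train.model")   # trained CRF model
crfexec(sc, "input.txt", "crf-output", location='local')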
Example 2: train_partition
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def train_partition(idx, iterator):
    port = 50000 + idx % 256
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
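train_partition takes a partition index plus an iterator, which is exactly the signature that mapPartitionsWithIndex passes to its function. A hedged sketch of how the driver might wire it up; the file locations, partition count, and number of gradient steps are assumptions, not from the original project.

# Assumed driver-side wiring for train_partition; paths are illustrative only.
sc.addFile("barista/main.py")
sc.addFile("models/train_val.prototxt")
sc.addFile("models/deepq16.caffemodel")
sc.addFile("models/solver.prototxt")

responses = (sc.parallelize(range(1024), 8)            # 8 partitions -> 8 Barista servers
               .mapPartitionsWithIndex(train_partition)
               .collect())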
Example 3: predict
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def predict(self, X):
    """ Assumes X is an RDD or a list of (data, label) minibatch tuples."""
    if isinstance(X, RDD):
        # Distribute files
        X.context.addFile(self._solver_filename)
        X.context.addFile(self._architecture_filename)
        X.mapPartitions(self.predict)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    # Might need to modify path to architecture file inside solver file.
    # Maybe we should do this before shipping the file since all Spark
    # tmp directories will be identically named.

    net = SGDSolver(solver_filename).net

    for minibatch_data, minibatch_label in X:
        # TODO: update function call for latest Caffe
        net.set_input_arrays(minibatch_data,
                             minibatch_label,
                             self.input_index)
        output = net.forward(end=self.score_blob)
        scores = output[self.score_blob]
        pred = np.argmax(scores, axis=1).squeeze()
        yield pred
Example 4: ship_prototxt_to_data
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def ship_prototxt_to_data(self, rdd):
    rdd.context.addFile(self._solver_filename)
    rdd.context.addFile(self._architecture_filename)
    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    return solver_filename, architecture_filename
Example 5: compute_buried_area
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def compute_buried_area(pdb_complex):
    chZ = "chZ"
    sasa_complex = -1.0
    sasa_rec = -1.0
    sasa_lig = -1.0
    buried_total = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_complex.xvg")
    f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_rec.xvg")
    f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig.xvg")

    # Makes the index file with the ligand (chain z) and the rest (non chain z)
    script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh")  # Getting bash script that was copied by addFile command
    command = script_make_ndx + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System " + " -xvg none " + " -o " + f_temp_sasa_complex
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # Makes f_temp_sasa_rec file
    script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh")  # Getting bash script that was copied by addFile command
    command = script_make_sasa_rec + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + f_temp_sasa_rec
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ " + " -xvg none " + " -o " + f_temp_sasa_lig
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
    sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
    sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

    buried_total = sasa_rec + sasa_lig - sasa_complex

    # Generating result - see column sorting, because the result file will be created based on this sorting
    returned_list = (base_name, buried_total)

    # Deleting files
    os.remove(f_ndx)
    os.remove(f_temp_sasa_complex)
    os.remove(f_temp_sasa_rec)
    os.remove(f_temp_sasa_lig)

    return returned_list
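compute_buried_area reads several broadcast variables (gromacs_path, pdb_ligand_path, path_analysis_pdb_complex_b) and resolves bash scripts that the driver must have shipped with addFile. A hedged sketch of that driver-side setup, assuming the broadcasts are created in the same module where compute_buried_area is defined; every path here is invented for illustration.

# Hypothetical driver-side setup assumed by compute_buried_area.
gromacs_path = sc.broadcast("/usr/local/gromacs/bin/")
pdb_ligand_path = sc.broadcast("/data/ligands")
path_analysis_pdb_complex_b = sc.broadcast("/data/analysis")

sc.addFile("scripts/make_ndx_buried_area_total.sh")
sc.addFile("scripts/make_sasa_rec_buried_area_total.sh")

buried_areas = (sc.parallelize(pdb_complex_files)   # list of complex .pdb paths (assumed)
                  .map(compute_buried_area)
                  .collect())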
Example 6: test_add_file_locally
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    self.sc.addFile(path)
    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
Example 7: load_timestep
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return valid values
    return (timestep, step[~step.mask])
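For load_timestep, the NetCDF file has to be present on every worker when download/copy_local is set; one way to arrange that is to addFile it on the driver and then map over the time steps. The source path and the number of steps below are assumptions, and data_path, download, and config are expected to come from the surrounding module.

# Hypothetical distribution of the NetCDF file read by load_timestep.
sc.addFile("/data/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc")
monthly_precip = sc.parallelize(range(60)).map(load_timestep).collect()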
Example 8: spawn_barista
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
Example 9: partitionIp2city
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
            city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    reader = database.Reader(SparkFiles.get(geoDBpath))
    #return [ip2city(ip) for ip in iter]
    return ip2city(iter)
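partitionIp2city resolves the GeoIP2 database by the name held in the module-level geoDBpath, so the driver has to ship that file first; the commented-out list comprehension is the variant that maps every IP in the partition. A hedged wiring sketch, in which the database location and the input RDD of IP addresses are assumptions:

# Hypothetical driver-side wiring for partitionIp2city.
geoDBpath = "GeoLite2-City.mmdb"
sc.addFile("/data/geoip/GeoLite2-City.mmdb")
cities = ip_rdd.mapPartitions(partitionIp2city).collect()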
Example 10: main
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def main(sc):
    sqlContext = SQLContext(sc)
    df = sqlContext.jsonFile(DATA_PATH)

    # add the filter file
    sc.addFile(FILTER_TERMS_FILE_PATH)
    filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
    global filter_terms_set_bc
    filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))

    # Register the DataFrame as a table.
    df.registerTempTable("tweet")
    results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")

    # filter tweets to find health related tweets
    filter_health_tweets = results.rdd.filter(healthFilter)
    filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
Example 11: compute_buried_area_ligand
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def compute_buried_area_ligand(pdb_complex):
    chZ = "chZ"
    buried_lig_rec_perc = -1.0
    buried_lig_rec = -1.0
    buried_lig_lig = -1.0
    buried_lig_lig_perc = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)
    pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    # ndx files
    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")

    # xvg files
    xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_pose" + ".xvg")
    xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_complex" + ".xvg")
    xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_min" + ".xvg")

    # Creates a selection with the residues that are closer than 6A to the ligand
    script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh")  # Getting bash script that was copied by addFile command
    command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + xvg_temp_sasa_lig_pose + " " + str(probe.value) + " " + str(ndots.value) + " " + xvg_temp_sasa_lig_complex + " " + pdb_before_vs + " " + xvg_temp_sasa_lig_min
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    try:
        # SASA of the isolated ligand in the pose conformation
        sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
        # SASA of the complexed ligand in the pose conformation
        sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
        # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
        sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)

        # Area of the ligand which is buried in the receptor
        buried_lig_rec = sasa_lig_pose - sasa_lig_complex
        buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose

        # Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
        buried_lig_lig = sasa_lig_min - sasa_lig_pose
        buried_lig_lig_perc = buried_lig_lig / sasa_lig_min

        returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

        # Deleting files
        os.remove(f_ndx)
        os.remove(xvg_temp_sasa_lig_pose)
        os.remove(xvg_temp_sasa_lig_complex)
        os.remove(xvg_temp_sasa_lig_min)

        return returned_list
    except:
        return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
Example 12: partition_processor
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def partition_processor(partitionlinechunks):
    """
    Partition logic for pyspark parallel processing
    """
    model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

    def set_predictions(x):
        segment = model_pipe_object.predict_proba(x)
        return segment

    df_with_nan = build_dataframe(partitionlinechunks)
    df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
    behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
    predictions_ser = set_predictions(behaviour_df)

    predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:, 'A'], predictions_ser.loc[:, 'Y'], predictions_ser.loc[:, 'segment'], predictions_ser.loc[:, 'model_version'])]]
    return iter(predictions_list)
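partition_processor unpickles a scikit-learn pipeline that the driver shipped as mmp_phase1_D2.clf. A hedged sketch of the call site, in which the model path and the input RDD of raw lines are assumptions:

# Hypothetical call site for partition_processor.
sc.addFile("/models/mmp_phase1_D2.clf")
scored = raw_lines_rdd.mapPartitions(partition_processor).collect()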
Example 13: load_matrix
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def load_matrix(
        filename,
        sc,
        num_users=NUM_USER,
        num_items=NUM_SONG
        ):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)
    return counts, num_users * num_items - num_zeros
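A hedged example of calling load_matrix; the file name is invented and would have to exist under the s3n prefix hard-coded in the function, while NUM_USER and NUM_SONG come from the surrounding module.

# Hypothetical call site for load_matrix.
counts, num_nonzero = load_matrix("train_triplets.txt", sc)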
Example 14: driver
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def driver(sc, inputFilename, outputDirectory,
           crfExecutable, crfScript,
           featureListFilename, crfModelFilename,
           eyeColorRef, eyeColorConfig, hairRef, hairConfig,
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = None

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))

    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))

    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors,
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
#......... part of this example's code is omitted here .........
Example 15: driver
# Required import: from pyspark import SparkFiles [as alias]
# Or alternatively: from pyspark.SparkFiles import get [as alias]
def driver(sc, inputFilename, outputDirectory,
           crfExecutable, crfScript,
           featureListFilename, crfModelFilename,
           eyeColorRef, eyeColorConfig, hairRef, hairConfig,
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = 8

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_final = (rdd_sequence_file_input
                 .mapValues(lambda x: json.loads(x))
                 .mapValues(lambda x: extract_body(x))
                 .mapValues(lambda x: textTokens(x))
                 .map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
                 .mapValues(lambda x: base64.b64encode(vectorToString(x)))
                 .values()
                 .pipe(cmd)
                 .map(lambda x: restore(x))
                 .mapValues(lambda x: computeSpans(x, indexed=True))
                 .filter(lambda p: p[1])
                 .flatMapValues(lambda x: list(x))
                 .mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
                 .mapValues(lambda x: json.dumps(x)))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
        # if outputFormat == "sequence":
        #     rdd_final.saveAsSequenceFile(outputDirectory)
        # elif outputFormat == "text":
        #     rdd_final.saveAsTextFile(outputDirectory)
        # else:
        #     raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)