This article collects typical usage examples of the Python method pyspark.SparkContext.addFile. If you have been wondering what SparkContext.addFile does, how to call it, or what it looks like in real code, the curated examples below may help. You can also read more about its containing class, pyspark.SparkContext.
The following presents 15 code examples of SparkContext.addFile, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
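Before the examples, here is a minimal sketch of the usual pattern: the driver registers a file with SparkContext.addFile, and each task resolves its own local copy through SparkFiles.get. The file path below is a hypothetical placeholder, not taken from any of the examples.

# Minimal addFile / SparkFiles sketch; "/tmp/lookup.txt" is a hypothetical file.
from pyspark import SparkContext, SparkFiles

sc = SparkContext("local[*]", "addfile-demo")
sc.addFile("/tmp/lookup.txt")  # ship the file to every node of the cluster

def in_lookup(word):
    # On each executor, resolve the local copy by its file name.
    with open(SparkFiles.get("lookup.txt")) as f:
        lookup = set(line.strip() for line in f)
    return word in lookup

print(sc.parallelize(["a", "b", "c"]).map(in_lookup).collect())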
Example 1: save_data_to_db
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def save_data_to_db():
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext
    conf = SparkConf().setMaster("localhost")
    sc = SparkContext("local[*]", "tikcket_mechine_gen")
    sc.setLogLevel("WARN")
    sc.addFile(lib_dir + '/getDistance.py')
    data_used_by_ticket_mechine_gen.drop()
    path = '/3/2014-10-15'
    for s in stations:
        full_path = data_dir_path + 'v0/' + s + path
        print full_path
        data_to_save = getDistance.get_one_day_group_by_time(full_path, sc)
        for item in data_to_save:
            data_used_by_ticket_mechine_gen.insert({'station_name': s, 'time': item[0], 'data': item[1]})
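A side note on Example 1: it ships getDistance.py with addFile and then calls into the module directly, which relies on the module also being importable wherever the code runs. When a shipped .py file needs to be imported inside tasks, SparkContext.addPyFile is the more usual call, since it also places the file on each worker's Python path. A hedged one-line sketch, reusing the example's lib_dir:

# Sketch only: addPyFile ships the module and makes it importable on the executors.
sc.addPyFile(lib_dir + '/getDistance.py')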
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')
    args = parser.parse_args()

    # Swift Connection
    if args.backend == 'swift':
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions
    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features)  # Only used when you want to create a feature set
    sqlContext = SQLContext(sc)  # Create the SQLContext from the SparkContext, to work with our default dataset format, i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism
    # Pass the sc (Spark Context) and sqlContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
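For context, once the Swift backend is configured as above, a module_implementation would typically read its Parquet inputs through the SQLContext. A hedged illustration: the container name and object path below are made up, and the swift:// URL form assumes the hadoop-openstack service name "SparkTest" registered above.

# Illustration only: "mycontainer" and the dataset path are hypothetical.
df = sqlContext.read.parquet("swift://mycontainer.SparkTest/datasets/input.parquet")
df.registerTempTable("input_data")
row_count = sqlContext.sql("SELECT COUNT(*) FROM input_data").first()[0]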
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    ### Initialize the SparkConf and SparkContext
    ### Locations of Python files.
    sheets_loc = "/root/IdeaNets/Synapsify/Synapsify/loadCleanly/sheets.py"
    lstm_class_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/lstm_class.py"
    load_params_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/load_params.py"
    preprocess_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/synapsify_preprocess.py"

    ### Pass Python files to Spark.
    pyFiles = []
    pyFiles.append(sheets_loc)
    pyFiles.append(lstm_class_loc)
    pyFiles.append(load_params_loc)
    pyFiles.append(preprocess_loc)

    ### Automatically get the master node URL from AWS; normally it is fixed.
    cmd = ["./../../spark/ec2/spark-ec2", "-r", "us-east-1", "get-master", "ruofan-cluster"]
    hostname = (
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0].split("\n")[2]
    )  ### host name of the master node.
    master_url = ""
    master_url += "spark://"
    master_url += hostname
    master_url += ":7077"
    # print master_url

    ### Initialize the Spark configuration.
    conf = SparkConf().setAppName("ruofan").setMaster(master_url)
    sc = SparkContext(conf=conf, pyFiles=pyFiles)

    ### Add non-Python files to be shipped to Spark.
    sc.addFile("/root/spark/bin/nonbreaking_prefix.en")
    sc.addFile("/root/IdeaNets/IdeaNets/models/lstm/scode/tokenizer.perl")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/stopwords.txt")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/prepositions.txt")

    datafile = sc.wholeTextFiles(
        "s3n://synapsify-lstm/Synapsify_data1", use_unicode=False
    )  ### Read the data directory from S3 storage.

    ### Run the application on each of the slave nodes.
    datafile.foreach(lambda (path, content): lstm_test(path, content))
Example 4: init_spark_context
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def init_spark_context():
    global predictionModel
    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])
    # absolute path in HDFS
    # to run locally, remove the first slash '/', i.e. my_model1, not /my_model1
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')
    return sc
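The pickle files registered above are presumably read back on the executors by file name. A hedged sketch of what that worker-side loading might look like (the contents of the .p files are not described by the example):

# Illustration only: loading one of the shipped pickle files inside a task.
import pickle
from pyspark import SparkFiles

def load_converter(name):
    # addFile('conv/6.p') makes the file available as '6.p' on every node
    with open(SparkFiles.get(name), 'rb') as f:
        return pickle.load(f)

# e.g. inside a map function: conv6 = load_converter('6.p')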
Example 5: sc
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def sc(self):  # noqa
    if not self._spark_context:
        spark_context = SparkContext(conf=self.spark_config)
        assert self.spex_conf.spex_file is not None, "The spex builder must be broken I do not know my spex conf!"
        spark_context.addFile(self.spex_conf.spex_file)

        for py_file in self.spex_conf.spark_config.py_files:
            spark_context.addPyFile(py_file)

        for file in self.spex_conf.spark_config.files:  # noqa
            spark_context.addFile(file)

        for jar in self.spex_conf.spark_config.jars:  # noqa
            spark_context.addFile(jar)

        self._spark_context = spark_context
        print_banner(self)
    return self._spark_context
Example 6: get_ingestion_start_end_id
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
try:
    c_options = parser.parse_args()
    print "Got options:", c_options
except Exception as inst:
    print inst
    parser.print_help()

es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)

# Setup SparkContext
sc = SparkContext(appName="extract-features-" + ingestion_id + job_suffix)
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
conf = SparkConf()
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

# Setup HBase managers
# just to be sure we will be able to write out to the table
get_create_table(c_options.tab_sha1_infos_name, c_options)
get_create_table(c_options.tab_update_name, c_options)
# hbase managers
hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_sha1_infos_name)
hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_update_name)
# Run extraction
Example 7: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
PY_FILES = ['settings.py', 'stanford_segmenter.py', 'pos_tag.py', 'logger.py', 'pymongo_spark.py']
FILES = ['NER.model']
CONF = {'spark.driver.extraClassPath':
        os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'}
STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'
LOG_DIR = 'log'
MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)
for prop, val in CONF.items():  # set configuration properties
    conf.set(prop, val)
sc = SparkContext(conf=conf, environment=ENV_VARS)
for f in PY_FILES:  # add dependencies
    sc.addPyFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
for f in FILES:  # add required files
    sc.addFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
pymongo_spark.activate()
Example 8: predict
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def predict(row_coord, cSize, model):
    vector_dict = {}
    for w in row_coord[1]:
        vector_dict[int(w[1])] = w[0]
    return (row_coord[0], model.value.predict(SparseVector(cSize.value, vector_dict)))

trainF = "./data/train"        # the path to where the train data is
testF = "./data/test"          # the path to the unlabelled data
saveF = "./predictions.txt"    # where to save the predictions
sc = SparkContext(appName=" \--(o_o)--/ ")  # initialize the spark context
# since we are not in the command line interface we need to add to the spark context
# some of our classes so that they are available to the workers
sc.addFile("/home/julien.hamilius/datacamp/code/helpers.py")
sc.addFile("/home/julien.hamilius/datacamp/code/extract_terms.py")
# now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et
# load data: data is a list with the text of each doc in each cell; Y is the respective class value
# (1: positive, 0: negative)
print "loading local data"
data, Y = lf.loadLabeled(trainF)
print "preprocessing"
pp.proc(data)  # clean the data of numbers, HTML tags and punctuation (except for "?!."; "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms)  # m is a compressed matrix holding the tf-idf matrix; the terms are extracted with our own custom function
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path for Gromacs project
    gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
    # Path where PDB ligands are - they do NOT participate in docking
    pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Path where all PDB receptors are
    path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
    # Path for saving PDB files of models generated by VS
    path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)

    # Adding Python source files
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "os_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "pdb_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    # Adding bash scripts
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_receptor.sh"))
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_receptor_res.sh"))

    # Parameters from the command line
    # Indicates probe. Example: 0.14
    # probe = float(sys.argv[1])
    # Indicates ndots. Example: 24
    # ndots = int(sys.argv[2])

    # Broadcast
    path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
    gromacs_path = sc.broadcast(gromacs_path)
    pdb_ligand_path = sc.broadcast(pdb_ligand_path)
    # probe = sc.broadcast(probe)
    # ndots = sc.broadcast(ndots)

    start_time = datetime.now()
    os.environ["GMX_MAXBACKUP"] = "-1"

    # Loading all PDB receptor files into memory
    list_all_pdb_receptor_files_path = []
    all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
    for receptor in all_receptor_for_complex:
        list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

    for pdb_receptor_files in list_all_pdb_receptor_files_path:
        # Getting the receptor name from the full path
        base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
        # PDB file loaded into memory is sent by broadcast
        pdb_file_receptor = pdb_receptor_files[1]
        pdb_file_receptor = sc.broadcast(pdb_file_receptor)
        # Loading PDB model files based on receptor into memory
        base_file_name_receptor_for_filter = base_file_name_receptor + "_-_"
        all_model_for_complex = get_files_pdb_filter(path_analysis_pdb, base_file_name_receptor_for_filter)
        all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
        all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

        # ********** Starting function **********************************************************
        def save_model_receptor(list_receptor_model_file):
            receptor_file = pdb_file_receptor.value  # Obtained from broadcast
            model_file = list_receptor_model_file[0]
            full_path_for_save_complex = list_receptor_model_file[1]
            # Open file for writing the complex
            f_compl = open(full_path_for_save_complex, "w")
            # Insert lines of receptor
            for item in receptor_file:
                f_compl.write(item)
            # Insert lines of model and insert Z chain
            for item in model_file:
                item = replace_chain_atom_line(item, "d", "z")
                f_compl.write(item)
            f_compl.close()
        # ********** Finish function **********************************************************

        # ********** Starting function **********************************************************
        def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
            chZ = "chZ"
            res_buried_area_perc = -1
            res_buried_area = -1
            buried_receptor_system = -1
            buried_receptor_res = -1
            base_name = get_name_model_pdb(pdb_complex)
            ligand_name = get_ligand_from_receptor_ligand_model(base_name)
            receptor_name = get_receptor_from_receptor_ligand_model(base_name)
            pose = get_model_from_receptor_ligand_model(base_name)
# ......... part of the code omitted here .........
Example 10: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
sys.path.append(SPARK_HOME_PYTHON)
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext(appName='topXIp')
# test local speed: only around 75s, much faster
# sc = SparkContext('local', 'topXIp')
# X = sys.argv[1]

# normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)
# attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
# Note: SparkFiles.get is usually given just the file name; passing an absolute
# path, as here, resolves back to the original local path on each node.
normalRdd = sc.textFile(SparkFiles.get(normalFilePath)).cache()
attackRdd = sc.textFile(SparkFiles.get(attackFilePath)).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
Example 11: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path for Gromacs project
    gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
    # Path where PDB ligands are - they do NOT participate in docking
    pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Path where all PDB receptors are
    path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
    # Path for saving PDB files of models generated by VS
    path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)

    # Adding Python source files
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "os_util.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "pdb_io.py"))

    # Adding bash scripts
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_ligand.sh"))

    # Parameters from the command line
    # Indicates probe. Example: 0.14
    probe = float(sys.argv[1])
    # Indicates ndots. Example: 24
    ndots = int(sys.argv[2])

    # Broadcast
    path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
    gromacs_path = sc.broadcast(gromacs_path)
    pdb_ligand_path = sc.broadcast(pdb_ligand_path)
    probe = sc.broadcast(probe)
    ndots = sc.broadcast(ndots)

    start_time = datetime.now()
    os.environ["GMX_MAXBACKUP"] = "-1"

    # Loading all PDB receptor files into memory
    list_all_pdb_receptor_files_path = []
    all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
    for receptor in all_receptor_for_complex:
        list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

    for pdb_receptor_files in list_all_pdb_receptor_files_path:
        # Getting the receptor name from the full path
        base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
        # PDB file loaded into memory is sent by broadcast
        pdb_file_receptor = pdb_receptor_files[1]
        pdb_file_receptor = sc.broadcast(pdb_file_receptor)
        # Loading PDB model files based on receptor into memory
        base_file_name_receptor_for_filter = base_file_name_receptor + "_-_"
        all_model_for_complex = get_files_pdb_filter(path_analysis_pdb, base_file_name_receptor_for_filter)
        all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
        all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

        # ********** Starting function **********************************************************
        def save_model_receptor(list_receptor_model_file):
            receptor_file = pdb_file_receptor.value  # Obtained from broadcast
            model_file = list_receptor_model_file[0]
            full_path_for_save_complex = list_receptor_model_file[1]
            # Open file for writing the complex
            f_compl = open(full_path_for_save_complex, "w")
            # Insert lines of receptor
            for item in receptor_file:
                f_compl.write(item)
            # Insert lines of model and insert Z chain
            for item in model_file:
                item = replace_chain_atom_line(item, "d", "z")
                f_compl.write(item)
            f_compl.close()
        # ********** Finish function **********************************************************

        # ********** Starting function **********************************************************
        def compute_buried_area_ligand(pdb_complex):
            chZ = "chZ"
            buried_lig_rec_perc = -1.0
            buried_lig_rec = -1.0
            buried_lig_lig = -1.0
            buried_lig_lig_perc = -1.0
            base_name = get_name_model_pdb(pdb_complex)
            ligand_name = get_ligand_from_receptor_ligand_model(base_name)
            receptor_name = get_receptor_from_receptor_ligand_model(base_name)
            pose = get_model_from_receptor_ligand_model(base_name)
            pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")
            # ndx files
            f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
# ......... part of the code omitted here .........
Example 12: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("compare_engine"))

    sc = SparkContext(conf=conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)
    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct()
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct()
    rdd_primary.partitionBy(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')

    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff

    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)
    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1, True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] = secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)

    spark_details(sc)
    sc.stop()
Example 13: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
# Dummy Spark App demo
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
import numpy as np

from barista.customer import Customer

conf = SparkConf().setAppName("Dummy Demo")
sc = SparkContext(conf=conf)

# Add prototxt files to Spark Context
sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

# Add barista module
sc.addPyFile("barista.zip")
sc.addPyFile("barista/start.py")


# Subclass the generic barista Customer
class MyCustomer(Customer):
    def __init__(self, filename):
        compute_semaphore, model_semaphore, handles = \
            Customer.parse_ipc_interface_file(filename)
        Customer.__init__(self, compute_semaphore, model_semaphore, handles)

    def update_data(self):
        self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
        self.arrays['label'][:] = np.random.choice(
            xrange(10),
            size=self.arrays['label'].shape)
Example 14: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)

local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)

# Still need to open the dataset on the master node to get the number of timesteps. For
# some reason the master node doesn't seem to be able to access the downloaded
# version; this may be a bug in addFile(...)
data = Dataset(local_data_path)
pr = data.variables['pr']

# Get the number of timesteps
num_timesteps = data.variables['time'].size
data.close()

# Now partition timesteps across the cluster
timesteps = sc.parallelize(range(0, num_timesteps), 30)
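Each executor in Example 14 would then open its own local copy of the shipped NetCDF file. A hedged sketch of the per-timestep work, assuming Dataset comes from netCDF4 as in the snippet and that the file was shipped from data_url:

# Illustration only: resolving the downloaded file on a worker and reading one timestep.
import os
from pyspark import SparkFiles
from netCDF4 import Dataset

def mean_precip_for_timestep(t):
    path = SparkFiles.get(os.path.basename(data_url))  # local copy on this node
    ds = Dataset(path)
    value = float(ds.variables['pr'][t].mean())
    ds.close()
    return value

means = timesteps.map(mean_precip_for_timestep).collect()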
Example 15: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
limit = 50

if __name__ == "__main__":
    sc = SparkContext(appName="MTurk")
    inputFilename = sys.argv[1]
    outputDirectory = sys.argv[2]
    featureListFilename = sys.argv[3]
    crfModelFilename = sys.argv[4]
    eyeRef = sys.argv[5]
    eyeConfig = sys.argv[6]
    hairRef = sys.argv[7]
    hairConfig = sys.argv[8]

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)

    # Add files to be downloaded with this Spark job on every node.
    sc.addFile("/usr/local/bin/crf_test")
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEye = HybridJaccard(ref_path=eyeRef, config_path=eyeConfig)
    smHair = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    rdd = sc.sequenceFile(inputFilename)
    if limit:
        rdd = sc.parallelize(rdd.take(limit))
    rdd_json = rdd.mapValues(lambda x: json.loads(x))
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
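Example 15 ships the crf_test binary and the CRF model with addFile; on the executors these are usually resolved through SparkFiles and invoked as an external process. A hedged sketch (the feature-file argument is only illustrative):

# Illustration only: running the shipped CRF++ binary against the shipped model on a worker.
import os
import subprocess
from pyspark import SparkFiles

def run_crf(feature_file_path):
    crf_binary = SparkFiles.get("crf_test")
    crf_model = SparkFiles.get(os.path.basename(crfModelFilename))
    # crf_test -m <model> <feature file> prints one labelled token per line
    return subprocess.check_output([crf_binary, "-m", crf_model, feature_file_path])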