This article collects typical usage examples of the Python method pyspark.SparkContext.addFile. If you have been wondering what SparkContext.addFile does, how to call it, or what it looks like in real code, the curated examples below may help. You can also read more about its containing class, pyspark.SparkContext.
The following presents 15 code examples of SparkContext.addFile, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
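Before the examples, here is a minimal sketch of the usual pattern: the driver registers a file with SparkContext.addFile, and each task resolves its own local copy through SparkFiles.get. The file path below is a hypothetical placeholder, not taken from any of the examples.

# Minimal addFile / SparkFiles sketch; "/tmp/lookup.txt" is a hypothetical file.
from pyspark import SparkContext, SparkFiles

sc = SparkContext("local[*]", "addfile-demo")
sc.addFile("/tmp/lookup.txt")  # ship the file to every node of the cluster

def in_lookup(word):
    # On each executor, resolve the local copy by its file name.
    with open(SparkFiles.get("lookup.txt")) as f:
        lookup = set(line.strip() for line in f)
    return word in lookup

print(sc.parallelize(["a", "b", "c"]).map(in_lookup).collect())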
Example 1: save_data_to_db
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def save_data_to_db():
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext
    conf = SparkConf().setMaster("localhost")
    sc = SparkContext("local[*]", "tikcket_mechine_gen")
    sc.setLogLevel("WARN")
    sc.addFile(lib_dir + '/getDistance.py')
    data_used_by_ticket_mechine_gen.drop()
    path = '/3/2014-10-15'
    for s in stations:
        full_path = data_dir_path + 'v0/' + s + path
        print full_path
        data_to_save = getDistance.get_one_day_group_by_time(full_path, sc)
        for item in data_to_save:
            data_used_by_ticket_mechine_gen.insert({'station_name': s, 'time': item[0], 'data': item[1]})
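A side note on Example 1: it ships getDistance.py with addFile and then calls into the module directly, which relies on the module also being importable wherever the code runs. When a shipped .py file needs to be imported inside tasks, SparkContext.addPyFile is the more usual call, since it also places the file on each worker's Python path. A hedged one-line sketch, reusing the example's lib_dir:

# Sketch only: addPyFile ships the module and makes it importable on the executors.
sc.addPyFile(lib_dir + '/getDistance.py')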
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')
    args = parser.parse_args()

    # Swift Connection
    if args.backend == 'swift':
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions
    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features)  # Only used when you want to create a feature set
    sqlContext = SQLContext(sc)  # Create the SQLContext from the SparkContext, to work with our default dataset format, i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism
    # Pass the sc (Spark Context) and sqlContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
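For context, once the Swift backend is configured as above, a module_implementation would typically read its Parquet inputs through the SQLContext. A hedged illustration: the container name and object path below are made up, and the swift:// URL form assumes the hadoop-openstack service name "SparkTest" registered above.

# Illustration only: "mycontainer" and the dataset path are hypothetical.
df = sqlContext.read.parquet("swift://mycontainer.SparkTest/datasets/input.parquet")
df.registerTempTable("input_data")
row_count = sqlContext.sql("SELECT COUNT(*) FROM input_data").first()[0]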
Example 3: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    ### Initialize the SparkConf and SparkContext
    ### Locations of Python files.
    sheets_loc = "/root/IdeaNets/Synapsify/Synapsify/loadCleanly/sheets.py"
    lstm_class_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/lstm_class.py"
    load_params_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/load_params.py"
    preprocess_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/synapsify_preprocess.py"

    ### Pass Python files to Spark.
    pyFiles = []
    pyFiles.append(sheets_loc)
    pyFiles.append(lstm_class_loc)
    pyFiles.append(load_params_loc)
    pyFiles.append(preprocess_loc)

    ### Automatically get the master node URL from AWS; normally it is fixed.
    cmd = ["./../../spark/ec2/spark-ec2", "-r", "us-east-1", "get-master", "ruofan-cluster"]
    hostname = (
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0].split("\n")[2]
    )  ### host name of the master node.
    master_url = ""
    master_url += "spark://"
    master_url += hostname
    master_url += ":7077"
    # print master_url

    ### Initialize the Spark configuration.
    conf = SparkConf().setAppName("ruofan").setMaster(master_url)
    sc = SparkContext(conf=conf, pyFiles=pyFiles)

    ### Add non-Python files to be shipped to Spark.
    sc.addFile("/root/spark/bin/nonbreaking_prefix.en")
    sc.addFile("/root/IdeaNets/IdeaNets/models/lstm/scode/tokenizer.perl")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/stopwords.txt")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/prepositions.txt")

    datafile = sc.wholeTextFiles(
        "s3n://synapsify-lstm/Synapsify_data1", use_unicode=False
    )  ### Read the data directory from S3 storage.

    ### Run the application on each of the slave nodes.
    datafile.foreach(lambda (path, content): lstm_test(path, content))
Example 4: init_spark_context
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def init_spark_context():
    global predictionModel
    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])
    # absolute path in HDFS
    # to run locally, remove the first slash '/', i.e. my_model1, not /my_model1
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')
    return sc
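The pickle files registered above are presumably read back on the executors by file name. A hedged sketch of what that worker-side loading might look like (the contents of the .p files are not described by the example):

# Illustration only: loading one of the shipped pickle files inside a task.
import pickle
from pyspark import SparkFiles

def load_converter(name):
    # addFile('conv/6.p') makes the file available as '6.p' on every node
    with open(SparkFiles.get(name), 'rb') as f:
        return pickle.load(f)

# e.g. inside a map function: conv6 = load_converter('6.p')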
Example 5: sc
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def sc(self):  # noqa
    if not self._spark_context:
        spark_context = SparkContext(conf=self.spark_config)
        assert self.spex_conf.spex_file is not None, "The spex builder must be broken I do not know my spex conf!"
        spark_context.addFile(self.spex_conf.spex_file)

        for py_file in self.spex_conf.spark_config.py_files:
            spark_context.addPyFile(py_file)

        for file in self.spex_conf.spark_config.files:  # noqa
            spark_context.addFile(file)

        for jar in self.spex_conf.spark_config.jars:  # noqa
            spark_context.addFile(jar)

        self._spark_context = spark_context
        print_banner(self)
    return self._spark_context
Example 6: get_ingestion_start_end_id
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
try:
    c_options = parser.parse_args()
    print "Got options:", c_options
except Exception as inst:
    print inst
    parser.print_help()

es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)

# Setup SparkContext
sc = SparkContext(appName="extract-features-" + ingestion_id + job_suffix)
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
conf = SparkConf()
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

# Setup HBase managers
# just to be sure we will be able to write out to the table
get_create_table(c_options.tab_sha1_infos_name, c_options)
get_create_table(c_options.tab_update_name, c_options)
# hbase managers
hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_sha1_infos_name)
hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_update_name)
# Run extraction
Example 7: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
PY_FILES = ['settings.py', 'stanford_segmenter.py', 'pos_tag.py', 'logger.py', 'pymongo_spark.py']
FILES = ['NER.model']
CONF = {'spark.driver.extraClassPath':
        os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'}
STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'
LOG_DIR = 'log'
MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)
for prop, val in CONF.items():  # set configuration properties
    conf.set(prop, val)
sc = SparkContext(conf=conf, environment=ENV_VARS)
for f in PY_FILES:  # add dependencies
    sc.addPyFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
for f in FILES:  # add required files
    sc.addFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
pymongo_spark.activate()
Example 8: predict
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def predict(row_coord, cSize, model):
    vector_dict = {}
    for w in row_coord[1]:
        vector_dict[int(w[1])] = w[0]
    return (row_coord[0], model.value.predict(SparseVector(cSize.value, vector_dict)))

trainF = "./data/train"        # the path to where the train data is
testF = "./data/test"          # the path to the unlabelled data
saveF = "./predictions.txt"    # where to save the predictions
sc = SparkContext(appName=" \--(o_o)--/ ")  # initialize the spark context
# since we are not in the command line interface we need to add to the spark context
# some of our classes so that they are available to the workers
sc.addFile("/home/julien.hamilius/datacamp/code/helpers.py")
sc.addFile("/home/julien.hamilius/datacamp/code/extract_terms.py")
# now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et
# load data: data is a list with the text of each doc in each cell; Y is the respective class value
# (1: positive, 0: negative)
print "loading local data"
data, Y = lf.loadLabeled(trainF)
print "preprocessing"
pp.proc(data)  # clean the data of numbers, HTML tags and punctuation (except for "?!."; "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms)  # m is a compressed matrix holding the tf-idf matrix; the terms are extracted with our own custom function
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path for Gromacs project
    gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
    # Path where PDB ligands are - they do NOT participate in docking
    pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Path where all PDB receptors are
    path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
    # Path for saving PDB files of models generated by VS
    path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)

    # Adding Python source files
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "os_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "pdb_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    # Adding bash scripts
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_receptor.sh"))
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_receptor_res.sh"))

    # Parameters from the command line
    # Indicates probe. Example: 0.14
    # probe = float(sys.argv[1])
    # Indicates ndots. Example: 24
    # ndots = int(sys.argv[2])

    # Broadcast
    path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
    gromacs_path = sc.broadcast(gromacs_path)
    pdb_ligand_path = sc.broadcast(pdb_ligand_path)
    # probe = sc.broadcast(probe)
    # ndots = sc.broadcast(ndots)

    start_time = datetime.now()
    os.environ["GMX_MAXBACKUP"] = "-1"

    # Loading all PDB receptor files into memory
    list_all_pdb_receptor_files_path = []
    all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
    for receptor in all_receptor_for_complex:
        list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

    for pdb_receptor_files in list_all_pdb_receptor_files_path:
        # Getting the receptor name from the full path
        base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
        # PDB file loaded into memory is sent by broadcast
        pdb_file_receptor = pdb_receptor_files[1]
        pdb_file_receptor = sc.broadcast(pdb_file_receptor)
        # Loading PDB model files based on receptor into memory
        base_file_name_receptor_for_filter = base_file_name_receptor + "_-_"
        all_model_for_complex = get_files_pdb_filter(path_analysis_pdb, base_file_name_receptor_for_filter)
        all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
        all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

        # ********** Starting function **********************************************************
        def save_model_receptor(list_receptor_model_file):
            receptor_file = pdb_file_receptor.value  # Obtained from broadcast
            model_file = list_receptor_model_file[0]
            full_path_for_save_complex = list_receptor_model_file[1]
            # Open file for writing the complex
            f_compl = open(full_path_for_save_complex, "w")
            # Insert lines of receptor
            for item in receptor_file:
                f_compl.write(item)
            # Insert lines of model and insert Z chain
            for item in model_file:
                item = replace_chain_atom_line(item, "d", "z")
                f_compl.write(item)
            f_compl.close()
        # ********** Finish function **********************************************************

        # ********** Starting function **********************************************************
        def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
            chZ = "chZ"
            res_buried_area_perc = -1
            res_buried_area = -1
            buried_receptor_system = -1
            buried_receptor_res = -1
            base_name = get_name_model_pdb(pdb_complex)
            ligand_name = get_ligand_from_receptor_ligand_model(base_name)
            receptor_name = get_receptor_from_receptor_ligand_model(base_name)
            pose = get_model_from_receptor_ligand_model(base_name)
# ......... part of the code omitted here .........
Example 10: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
sys.path.append(SPARK_HOME_PYTHON)
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext(appName='topXIp')
# test local speed: only around 75s, much faster
# sc = SparkContext('local', 'topXIp')
# X = sys.argv[1]

# normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)
# attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
# Note: SparkFiles.get is usually given just the file name; passing an absolute
# path, as here, resolves back to the original local path on each node.
normalRdd = sc.textFile(SparkFiles.get(normalFilePath)).cache()
attackRdd = sc.textFile(SparkFiles.get(attackFilePath)).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
Example 11: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path for Gromacs project
    gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
    # Path where PDB ligands are - they do NOT participate in docking
    pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Path where all PDB receptors are
    path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
    # Path for saving PDB files of models generated by VS
    path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)

    # Adding Python source files
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "os_util.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "pdb_io.py"))

    # Adding bash scripts
    sc.addFile(os.path.join(path_spark_drugdesign, "make_ndx_buried_area_ligand.sh"))

    # Parameters from the command line
    # Indicates probe. Example: 0.14
    probe = float(sys.argv[1])
    # Indicates ndots. Example: 24
    ndots = int(sys.argv[2])

    # Broadcast
    path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
    gromacs_path = sc.broadcast(gromacs_path)
    pdb_ligand_path = sc.broadcast(pdb_ligand_path)
    probe = sc.broadcast(probe)
    ndots = sc.broadcast(ndots)

    start_time = datetime.now()
    os.environ["GMX_MAXBACKUP"] = "-1"

    # Loading all PDB receptor files into memory
    list_all_pdb_receptor_files_path = []
    all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
    for receptor in all_receptor_for_complex:
        list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

    for pdb_receptor_files in list_all_pdb_receptor_files_path:
        # Getting the receptor name from the full path
        base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
        # PDB file loaded into memory is sent by broadcast
        pdb_file_receptor = pdb_receptor_files[1]
        pdb_file_receptor = sc.broadcast(pdb_file_receptor)
        # Loading PDB model files based on receptor into memory
        base_file_name_receptor_for_filter = base_file_name_receptor + "_-_"
        all_model_for_complex = get_files_pdb_filter(path_analysis_pdb, base_file_name_receptor_for_filter)
        all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
        all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

        # ********** Starting function **********************************************************
        def save_model_receptor(list_receptor_model_file):
            receptor_file = pdb_file_receptor.value  # Obtained from broadcast
            model_file = list_receptor_model_file[0]
            full_path_for_save_complex = list_receptor_model_file[1]
            # Open file for writing the complex
            f_compl = open(full_path_for_save_complex, "w")
            # Insert lines of receptor
            for item in receptor_file:
                f_compl.write(item)
            # Insert lines of model and insert Z chain
            for item in model_file:
                item = replace_chain_atom_line(item, "d", "z")
                f_compl.write(item)
            f_compl.close()
        # ********** Finish function **********************************************************

        # ********** Starting function **********************************************************
        def compute_buried_area_ligand(pdb_complex):
            chZ = "chZ"
            buried_lig_rec_perc = -1.0
            buried_lig_rec = -1.0
            buried_lig_lig = -1.0
            buried_lig_lig_perc = -1.0
            base_name = get_name_model_pdb(pdb_complex)
            ligand_name = get_ligand_from_receptor_ligand_model(base_name)
            receptor_name = get_receptor_from_receptor_ligand_model(base_name)
            pose = get_model_from_receptor_ligand_model(base_name)
            pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")
            # ndx files
            f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
# ......... part of the code omitted here .........
Example 12: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
def main():
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("compare_engine"))

    sc = SparkContext(conf=conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)
    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct()
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct()
    rdd_primary.partitionBy(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')

    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff

    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)
    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1, True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] = secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)

    spark_details(sc)
    sc.stop()
Example 13: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
# Dummy Spark App demo
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
import numpy as np

from barista.customer import Customer

conf = SparkConf().setAppName("Dummy Demo")
sc = SparkContext(conf=conf)

# Add prototxt files to Spark Context
sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

# Add barista module
sc.addPyFile("barista.zip")
sc.addPyFile("barista/start.py")


# Subclass the generic barista Customer
class MyCustomer(Customer):
    def __init__(self, filename):
        compute_semaphore, model_semaphore, handles = \
            Customer.parse_ipc_interface_file(filename)
        Customer.__init__(self, compute_semaphore, model_semaphore, handles)

    def update_data(self):
        self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
        self.arrays['label'][:] = np.random.choice(
            xrange(10),
            size=self.arrays['label'].shape)
Example 14: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)

local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)

# Still need to open the dataset on the master node to get the number of timesteps. For
# some reason the master node doesn't seem to be able to access the downloaded
# version; this may be a bug in addFile(...)
data = Dataset(local_data_path)
pr = data.variables['pr']

# Get the number of timesteps
num_timesteps = data.variables['time'].size
data.close()

# Now partition timesteps across the cluster
timesteps = sc.parallelize(range(0, num_timesteps), 30)
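Each executor in Example 14 would then open its own local copy of the shipped NetCDF file. A hedged sketch of the per-timestep work, assuming Dataset comes from netCDF4 as in the snippet and that the file was shipped from data_url:

# Illustration only: resolving the downloaded file on a worker and reading one timestep.
import os
from pyspark import SparkFiles
from netCDF4 import Dataset

def mean_precip_for_timestep(t):
    path = SparkFiles.get(os.path.basename(data_url))  # local copy on this node
    ds = Dataset(path)
    value = float(ds.variables['pr'][t].mean())
    ds.close()
    return value

means = timesteps.map(mean_precip_for_timestep).collect()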
Example 15: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import addFile [as alias]
limit = 50

if __name__ == "__main__":
    sc = SparkContext(appName="MTurk")
    inputFilename = sys.argv[1]
    outputDirectory = sys.argv[2]
    featureListFilename = sys.argv[3]
    crfModelFilename = sys.argv[4]
    eyeRef = sys.argv[5]
    eyeConfig = sys.argv[6]
    hairRef = sys.argv[7]
    hairConfig = sys.argv[8]

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)

    # Add files to be downloaded with this Spark job on every node.
    sc.addFile("/usr/local/bin/crf_test")
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEye = HybridJaccard(ref_path=eyeRef, config_path=eyeConfig)
    smHair = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    rdd = sc.sequenceFile(inputFilename)
    if limit:
        rdd = sc.parallelize(rdd.take(limit))
    rdd_json = rdd.mapValues(lambda x: json.loads(x))
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
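Example 15 ships the crf_test binary and the CRF model with addFile; on the executors these are usually resolved through SparkFiles and invoked as an external process. A hedged sketch (the feature-file argument is only illustrative):

# Illustration only: running the shipped CRF++ binary against the shipped model on a worker.
import os
import subprocess
from pyspark import SparkFiles

def run_crf(feature_file_path):
    crf_binary = SparkFiles.get("crf_test")
    crf_model = SparkFiles.get(os.path.basename(crfModelFilename))
    # crf_test -m <model> <feature file> prints one labelled token per line
    return subprocess.check_output([crf_binary, "-m", crf_model, feature_file_path])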