This article collects typical usage examples of the pyspark.SparkContext class in Python. If you are trying to figure out what pyspark.SparkContext is for, how to use it, or simply want to see it in real code, the curated examples below should help. You can also explore further usage examples from the pyspark module where this class is defined.
The following presents 15 code examples of pyspark.SparkContext, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
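Before diving into the collected examples, here is a minimal sketch of the basic SparkContext life cycle; the app name, master URL, and toy job are placeholders, not taken from any of the examples below.

# Minimal sketch (not one of the collected examples): create a SparkContext,
# run a trivial job, and stop it. App name and master URL are placeholders.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("sparkcontext-demo").setMaster("local[2]")
sc = SparkContext(conf=conf)
try:
    total = sc.parallelize(range(10)).sum()
    print(total)  # 45
finally:
    sc.stop()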
Example 1: main
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext


def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it to do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example 2: bluecoat_parse
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save Bluecoat proxy logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for the Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
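A hypothetical invocation of the function above might look like the following sketch; the ZooKeeper quorum, topic, and Hive database/table names are placeholders, not values from the original project.

# Hypothetical call; all argument values below are placeholders.
bluecoat_parse(
    zk="zk-host-1:2181,zk-host-2:2181",
    topic="spot-proxy",
    db="spot_db",
    db_table="proxy_logs",
    num_of_workers=4,
    batch_size=30,
)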
Example 3: run
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def run():
    from pyspark import SparkContext, SparkConf

    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()
    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)
    process(sc, graph, inputs=inputs, args=args)
Example 4: create_sc
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
from pyspark import SparkConf, SparkContext


def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # `sc` is None here, so `sc.stop()` always raises and the except branch
    # ends up creating the context.
    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
Example 5: main
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example 6: main
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def main(args):
    window_size = 600
    files = filecollector.collect(args.input_path)

    sc = SparkContext("local", "sparkline")
    # Note: the tuple-unpacking lambdas below are Python 2-only syntax.
    pipeline = (
        sc.parallelize(files, 4)
        .map(lambda f: read_wav(f))
        .flatMap(lambda (f, signal, samplerate): sliding_audio(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): downsample(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): apply_melfilter(f, signal, samplerate))
        .map(lambda (f, image): (f, graphic.colormapping.to_grayscale(image, bytes=True)))
        .map(lambda (f, image): (f, graphic.histeq.histeq(image)))
        .map(lambda (f, image): (f, graphic.histeq.clamp_and_equalize(image)))
        .map(lambda (f, image): (f, graphic.windowing.cut_or_pad_window(image, window_size)))
        .map(lambda (f, image): output.image.save(f, image, args.output_path))
    )
    pipeline.collect()
    # .map(lambda (f, signal, samplerate): generate_spectrograms(f, signal, samplerate))
Example 7: sql_context
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def sql_context(self, application_name):
    """Create a Spark context and SQL context given the parameters configured in this class.

    The caller is responsible for calling ``.stop`` on the resulting Spark context.

    Parameters
    ----------
    application_name : string

    Returns
    -------
    (sc, sqlContext) : (SparkContext, SQLContext)
    """
    sc = self.spark_context(application_name)
    import pyspark
    sqlContext = pyspark.SQLContext(sc)
    return (sc, sqlContext)
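Since the enclosing class is not shown in this snippet, the usage below is only a hypothetical sketch; `factory` stands for whatever object provides `spark_context` and `sql_context`.

# Hypothetical usage; `factory` is a placeholder for the enclosing class instance.
sc, sqlContext = factory.sql_context("my-application")
try:
    df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    print(df.count())  # 2
finally:
    sc.stop()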
Example 8: _py2java
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def _py2java(sc, obj):
    """Convert a Python object into a Java object."""
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    # Note: `long` and `unicode` make this branch Python 2-only.
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example 9: main
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
import sys

from py4j.java_gateway import java_import
from pyspark import SparkConf, SparkContext


def main():
    if len(sys.argv) != 3:
        print("Usage: example <keyspace_name> <column_family_name>", file=sys.stderr)
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here: https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")
    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print(sc._jvm.CassandraJavaUtil)

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print(rdd.collect())
Example 10: sparkSession
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # the Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to:
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create a python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create the Scala side HiveTestContext SparkSession
        #   * Create the python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm

        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())

        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "") \
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe") \
            .set("spark.sql.warehouse.dir", hivedir) \
            .set("spark.ui.enabled", "false")

        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example 11: __call__
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)
    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print('\n'.join(str(i) for i in m.take(self.sample)))

    log.info('Done.')
Example 12: _spark_session
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    .. note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop()
Example 13: spark_context
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def spark_context(_spark_session):
    """Return a SparkContext instance with reduced logging
    (session scope).
    """
    if _spark_session is None:
        from pyspark import SparkContext

        # pyspark 1.x: create a SparkContext instance
        sc = SparkContext(conf=SparkConfigBuilder().get())
    else:
        # pyspark 2.x: get the SparkContext from the SparkSession fixture
        sc = _spark_session.sparkContext

    reduce_logging(sc)
    yield sc

    if _spark_session is None:
        sc.stop()  # pyspark 1.x: stop the SparkContext instance
Example 14: sc
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def sc(request):
    """Fixture for creating a Spark context.

    Args:
        request: pytest.FixtureRequest object
    """
    assert (
        request.config.getoption("--spark-master") is not None
    ), 'No Spark Master Address provided, use --spark-master: "spark://host:port"'

    conf = (
        SparkConf()
        .setMaster(request.config.getoption("--spark-master"))
        .setAppName("pytest-pyspark-local-testing")
        .set("spark.dynamicAllocation.maxExecutors", 2)
        .set("spark.executor.instances", 2)
    )
    scont = SparkContext(conf=conf)
    request.addfinalizer(lambda: scont.stop())

    quiet_py4j()
    return scont
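A hypothetical test that consumes this fixture could look like the sketch below; the test body and the master URL passed on the command line are placeholders, not part of the original example.

# Hypothetical usage of the `sc` fixture above (placeholder test).
def test_word_count(sc):
    rdd = sc.parallelize(["spark makes counting easy", "spark is fast"])
    counts = dict(
        rdd.flatMap(lambda line: line.split())
           .map(lambda word: (word, 1))
           .reduceByKey(lambda a, b: a + b)
           .collect()
    )
    assert counts["spark"] == 2

# Run with, e.g.: pytest --spark-master "spark://host:7077"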
Example 15: run
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkContext [as alias]
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()
    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(appName=self.name, conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)
    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()