This article collects typical usage examples of pyspark.__version__ in Python. If you are wondering what pyspark.__version__ does, how to use it, or are looking for concrete examples, the curated code samples below may help. You can also explore further usage examples from the pyspark module.
The following shows 15 code examples of pyspark.__version__, sorted by popularity by default.
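Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the version-gating pattern most of them share: compare pyspark.__version__ against a threshold with distutils.version.LooseVersion and branch on the result. The "2.4" threshold and the use_arrow flag are purely illustrative.

# Minimal sketch of the version-gating pattern; "2.4" and `use_arrow` are illustrative only.
from distutils.version import LooseVersion

import pyspark

if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
    # Older releases: avoid features that require PySpark 2.4+ (e.g. Arrow-based paths).
    use_arrow = False
else:
    use_arrow = True

print(pyspark.__version__, use_arrow)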
Example 1: assert_pyspark_version
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def assert_pyspark_version():
    import logging

    pyspark_ver = None
    try:
        import pyspark
    except ImportError:
        raise ImportError(
            "Unable to import pyspark - consider doing a pip install with [spark] "
            "extra to install pyspark with pip"
        )
    else:
        pyspark_ver = getattr(pyspark, "__version__")
        if pyspark_ver is None or pyspark_ver < "2.4":
            logging.warning(
                'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
                    pyspark_ver if pyspark_ver is not None else "<unknown version>"
                )
            )
Example 2: test_rfloordiv
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_rfloordiv(self):
    pdf = pd.DataFrame(
        {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
        index=["circle", "triangle", "rectangle"],
        columns=["angles", "degrees"],
    )
    kdf = ks.from_pandas(pdf)

    if LooseVersion(pd.__version__) < LooseVersion("1.0.0") and LooseVersion(
        pd.__version__
    ) >= LooseVersion("0.24.0"):
        expected_result = pd.DataFrame(
            {"angles": [np.inf, 3.0, 2.0], "degrees": [0.0, 0.0, 0.0]},
            index=["circle", "triangle", "rectangle"],
            columns=["angles", "degrees"],
        )
    else:
        expected_result = pdf.rfloordiv(10)

    self.assert_eq(kdf.rfloordiv(10), expected_result)
Example 3: test_repeat
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_repeat(self):
    pser = pd.Series(["a", "b", "c"], name="0", index=np.random.rand(3))
    kser = ks.from_pandas(pser)
    self.assert_eq(kser.repeat(3).sort_index(), pser.repeat(3).sort_index())
    self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index())
    self.assertRaises(ValueError, lambda: kser.repeat(-1))
    self.assertRaises(ValueError, lambda: kser.repeat("abc"))

    pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3))
    kdf = ks.from_pandas(pdf)
    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        self.assertRaises(ValueError, lambda: kdf.a.repeat(kdf.rep))
    else:
        self.assert_eq(kdf.a.repeat(kdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index())
Example 4: test_div_zero_and_nan
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_div_zero_and_nan(self):
    pser = pd.Series([100, None, -300, None, 500, -700, np.inf, -np.inf], name="Koalas")
    kser = ks.from_pandas(pser)

    self.assert_eq(repr(pser.div(0)), repr(kser.div(0)))
    self.assert_eq(repr(pser.truediv(0)), repr(kser.truediv(0)))
    self.assert_eq(repr(pser / 0), repr(kser / 0))
    self.assert_eq(repr(pser.div(np.nan)), repr(kser.div(np.nan)))
    self.assert_eq(repr(pser.truediv(np.nan)), repr(kser.truediv(np.nan)))
    self.assert_eq(repr(pser / np.nan), repr(kser / np.nan))

    # floordiv has different behavior in pandas >= 1.0.0 when dividing by 0
    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self.assert_eq(repr(pser.floordiv(0)), repr(kser.floordiv(0)))
        self.assert_eq(repr(pser // 0), repr(kser // 0))
    else:
        result = pd.Series(
            [np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf], name="Koalas"
        )
        self.assert_eq(repr(kser.floordiv(0)), repr(result))
        self.assert_eq(repr(kser // 0), repr(result))

    self.assert_eq(repr(pser.floordiv(np.nan)), repr(kser.floordiv(np.nan)))
Example 5: apply_async
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def apply_async(self, func, callback=None):
    # Note the `func` args is a batch here. (BatchedCalls type)
    # See joblib.parallel.Parallel._dispatch
    def run_on_worker_and_fetch_result():
        # TODO: handle possible spark exception here. # pylint: disable=fixme
        rdd = self._spark.sparkContext.parallelize([0], 1) \
            .map(lambda _: cloudpickle.dumps(func()))
        if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
            ser_res = rdd.collect()[0]
        else:
            ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
        return cloudpickle.loads(ser_res)

    return self._get_pool().apply_async(
        SafeFunction(run_on_worker_and_fetch_result),
        callback=callback
    )
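The major-version check above goes through pyspark.util.VersionUtils rather than parsing the string by hand. A small standalone sketch of that step (the version string is a made-up example):

from pyspark.util import VersionUtils

# majorMinorVersion parses "X.Y.Z..." into an (X, Y) tuple of ints.
major, minor = VersionUtils.majorMinorVersion("3.0.1")  # illustrative version string
assert (major, minor) == (3, 0)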
Example 6: get_default_conda_env
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def get_default_conda_env():
    """
    :return: The default Conda environment for MLflow Models produced by calls to
             :func:`save_model()` and :func:`log_model()`. This Conda environment
             contains the current version of PySpark that is installed on the caller's
             system. ``dev`` versions of PySpark are replaced with stable versions in
             the resulting Conda environment (e.g., if you are running PySpark version
             ``2.4.5.dev0``, invoking this method produces a Conda environment with a
             dependency on PySpark version ``2.4.5``).
    """
    import pyspark

    # Strip the suffix from `dev` versions of PySpark, which are not
    # available for installation from Anaconda or PyPI
    pyspark_version = re.sub(r"(\.?)dev.*", "", pyspark.__version__)

    return _mlflow_conda_env(
        additional_conda_deps=[
            "pyspark={}".format(pyspark_version),
        ],
        additional_pip_deps=None,
        additional_conda_channels=None)
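The re.sub call above can be exercised on its own; a minimal sketch with made-up version strings:

import re

# Any "dev" suffix is dropped so the pinned conda dependency resolves on Anaconda/PyPI.
assert re.sub(r"(\.?)dev.*", "", "2.4.5.dev0") == "2.4.5"
# Stable versions pass through unchanged.
assert re.sub(r"(\.?)dev.*", "", "3.0.0") == "3.0.0"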
Example 7: version
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def version(self):
    return parse_version(ps.__version__)
Example 8: __enter__
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def __enter__(self):
    # import locally to avoid importing tensorflow globally.
    from petastorm.tf_utils import make_petastorm_dataset
    import tensorflow.compat.v1 as tf  # pylint: disable=import-error

    _wait_file_available(self.parquet_file_url_list)
    self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)

    # unroll dataset
    dataset = make_petastorm_dataset(self.reader).flat_map(
        tf.data.Dataset.from_tensor_slices)

    # TODO: auto tune best batch size in default case.
    batch_size = self.batch_size or 32
    dataset = dataset.batch(batch_size=batch_size)

    prefetch = self.prefetch
    if prefetch is None:
        if LooseVersion(tf.__version__) >= LooseVersion('1.14'):
            # We can make prefetch optimization
            prefetch = tf.data.experimental.AUTOTUNE
        else:
            prefetch = 1

    dataset = dataset.prefetch(prefetch)
    return dataset
Example 9: default_session
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def default_session(conf=None):
    if conf is None:
        conf = dict()
    should_use_legacy_ipc = False
    if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion(
        pyspark.__version__
    ) < LooseVersion("3.0"):
        conf["spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.mesos.driverEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.kubernetes.driverEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        should_use_legacy_ipc = True

    builder = spark.SparkSession.builder.appName("Koalas")
    for key, value in conf.items():
        builder = builder.config(key, value)
    # Currently, Koalas is dependent on such join due to 'compute.ops_on_diff_frames'
    # configuration. This is needed with Spark 3.0+.
    builder.config("spark.sql.analyzer.failAmbiguousSelfJoin", False)
    session = builder.getOrCreate()

    if not should_use_legacy_ipc:
        is_legacy_ipc_set = any(
            v == "1"
            for v in [
                session.conf.get("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.mesos.driverEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.kubernetes.driverEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
            ]
        )
        if is_legacy_ipc_set:
            raise RuntimeError(
                "Please explicitly unset 'ARROW_PRE_0_15_IPC_FORMAT' environment variable in "
                "both driver and executor sides. Check your spark.executorEnv.*, "
                "spark.yarn.appMasterEnv.*, spark.mesos.driverEnv.* and "
                "spark.kubernetes.driverEnv.* configurations. It is required to set this "
                "environment variable only when you use pyarrow>=0.15 and pyspark<3.0."
            )

    return session
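A note on the design choice: setting ARROW_PRE_0_15_IPC_FORMAT=1 makes pyarrow 0.15+ emit the legacy Arrow IPC stream format that Spark 2.x still expects, while Spark 3.0 handles the new format natively. That is why the function only injects the variable when pyspark < 3.0, and raises if it finds the variable already set in a configuration where it is not needed.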
Example 10: value_counts
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if (
        LooseVersion(pyspark.__version__) < LooseVersion("2.4")
        and default_session().conf.get("spark.sql.execution.arrow.enabled") == "true"
        and isinstance(self, MultiIndex)
    ):
        raise RuntimeError(
            "if you're using pyspark < 2.4, set conf "
            "'spark.sql.execution.arrow.enabled' to 'false' "
            "for using this function with MultiIndex"
        )
    return super(MultiIndex, self).value_counts(
        normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna
    )
Example 11: test_udt
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_udt(self):
    sparse_values = {0: 0.1, 1: 1.1}
    sparse_vector = SparseVector(len(sparse_values), sparse_values)
    pdf = pd.DataFrame({"a": [sparse_vector], "b": [10]})

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            kdf = ks.from_pandas(pdf)
            self.assert_eq(kdf, pdf)
    else:
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kdf, pdf)
Example 12: test_to_frame
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_to_frame(self):
    pidx = self.pdf.index
    kidx = self.kdf.index

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    pidx.name = "a"
    kidx.name = "a"

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        # The `name` argument is added in pandas 0.24.
        self.assert_eq(repr(kidx.to_frame(name="x")), repr(pidx.to_frame(name="x")))
        self.assert_eq(
            repr(kidx.to_frame(index=False, name="x")),
            repr(pidx.to_frame(index=False, name="x")),
        )

    pidx = self.pdf.set_index("b", append=True).index
    kidx = self.kdf.set_index("b", append=True).index

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        # The `name` argument is added in pandas 0.24.
        self.assert_eq(
            repr(kidx.to_frame(name=["x", "y"])), repr(pidx.to_frame(name=["x", "y"]))
        )
        self.assert_eq(
            repr(kidx.to_frame(index=False, name=["x", "y"])),
            repr(pidx.to_frame(index=False, name=["x", "y"])),
        )
Example 13: test_multi_index_names
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_multi_index_names(self):
    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
    pdf = pd.DataFrame(np.random.randn(4, 5), idx)
    kdf = ks.from_pandas(pdf)

    self.assertEqual(kdf.index.names, pdf.index.names)

    pidx = pdf.index
    kidx = kdf.index
    pidx.names = ["renamed_number", "renamed_color"]
    kidx.names = ["renamed_number", "renamed_color"]
    self.assertEqual(kidx.names, pidx.names)

    pidx.names = ["renamed_number", None]
    kidx.names = ["renamed_number", None]
    self.assertEqual(kidx.names, pidx.names)
    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        # PySpark < 2.4 does not support struct type with arrow enabled.
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            self.assert_eq(kidx, pidx)
    else:
        self.assert_eq(kidx, pidx)

    with self.assertRaises(PandasNotImplementedError):
        kidx.name
    with self.assertRaises(PandasNotImplementedError):
        kidx.name = "renamed"
Example 14: test_value_counts
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_value_counts(self):
    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            self._test_value_counts()
        self.assertRaises(
            RuntimeError,
            lambda: ks.MultiIndex.from_tuples([("x", "a"), ("x", "b")]).value_counts(),
        )
    else:
        self._test_value_counts()
Example 15: test_to_list
# Required import: import pyspark [as alias]
# Or: from pyspark import __version__ [as alias]
def test_to_list(self):
    if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
        self.assertEqual(self.kser.to_list(), self.pser.to_list())