This article collects typical usage examples of the Python method pyspark.mllib.linalg.Vectors.sparse. If you have been wondering what exactly Vectors.sparse does, how to call it, or where to find working examples, the curated code samples below may help. You can also read further about the enclosing class, pyspark.mllib.linalg.Vectors.
The 15 code examples of Vectors.sparse shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
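Before the examples, a minimal sketch of the call signatures Vectors.sparse accepts: a size plus either a dict, a list of (index, value) pairs, or parallel index and value lists. All three calls below build the same vector.
from pyspark.mllib.linalg import Vectors

# Three equivalent ways to build a 4-dimensional sparse vector with
# nonzero entries at indices 1 and 3.
v1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})      # dict of index -> value
v2 = Vectors.sparse(4, [(1, 1.0), (3, 5.5)])  # list of (index, value) pairs
v3 = Vectors.sparse(4, [1, 3], [1.0, 5.5])    # parallel index and value lists
assert v1 == v2 == v3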
Example 1: test_append_bias_with_sp_vector
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_append_bias_with_sp_vector(self):
    data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
    # Returned value must be a SparseVector
    ret = MLUtils.appendBias(data)
    self.assertEqual(ret, expected)
    self.assertEqual(type(ret), SparseVector)
Example 2: test_right_number_of_results
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_right_number_of_results(self):
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
Example 3: parseEntry
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def parseEntry(xx):
    # Note: maxInd and intervalIndDict are module-level globals defined elsewhere
    # in the original script; intervalIndDict maps (weekday, hour) to a feature index.
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    xx = xx.split('\t')
    a_virtual = xx[0]
    browser = xx[1]
    referrer = xx[2]
    a_user_key = xx[3]
    try:
        birthyear = int(xx[4])
        age = 2015 - birthyear
    except Exception as _:
        birthyear = xx[4]
        age = -1
    gender = xx[5]
    if xx[6] != 'NAN':
        reg_date = datetime.datetime.strptime(xx[6], '%Y-%m-%d')
    else:
        reg_date = mindate
    device = xx[7]
    date = datetime.datetime.strptime(xx[8], '%d-%m-%Y')
    tdiff = datetime.timedelta(hours=int(xx[9]))
    date = date + tdiff
    year = date.year
    month = date.month
    day = date.day
    hour = int(xx[9])
    weekday = date.weekday()
    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        days_since_registration = -1
    # Metric fields carry a ',0' decimal suffix (e.g. '3,0'); strip it before int().
    metrics = [int(x.replace(',0', '')) for x in xx[10:]]
    visits = metrics[0]
    visits_betalt = metrics[1]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]
    timegroup_pvs = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], 1.)])
    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender, days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs, timegroup_visit=timegroup_visit,
               a_virtual=a_virtual)
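For context, a parser like this is applied line by line over the raw log RDD; the driver sketch below is an assumption for illustration (the input path and SQLContext setup are not from the original snippet):
# Hypothetical driver: parse each tab-separated log line into a Row
# and build a DataFrame from the result.
lines = sc.textFile("hdfs:///logs/pageviews.tsv")  # assumed input path
df = sqlContext.createDataFrame(lines.map(parseEntry))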
Example 4: ztest_toPandas
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def ztest_toPandas(self):
    data = [(Vectors.dense([0.1, 0.2]),),
            (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
            (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
    df = self.sql.createDataFrame(data, ["features"])
    self.assertEqual(df.count(), 3)
    pd = self.converter.toPandas(df)
    self.assertEqual(len(pd), 3)
    self.assertTrue(isinstance(pd.features[0], csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s" %
                    type(pd.features[0]))
    self.assertEqual(pd.features[0].shape[0], 3)
    self.assertEqual(pd.features[0].shape[1], 2)
    self.assertEqual(pd.features[0][0, 0], 0.1)
    self.assertEqual(pd.features[0][0, 1], 0.2)
Example 5: test_persistence
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
Example 6: add_svec
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def add_svec(sv1, sv2):
    # Element-wise sum of two SparseVectors: a two-pointer merge over
    # the (sorted) index arrays of both inputs.
    assert len(sv1) == len(sv2), "dimension mismatch"
    indices = []
    values = []
    i, j = 0, 0
    while i < len(sv1.indices) and j < len(sv2.indices):
        if sv1.indices[i] == sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i] + sv2.values[j])
            i += 1
            j += 1
        elif sv1.indices[i] < sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i])
            i += 1
        else:
            indices.append(sv2.indices[j])
            values.append(sv2.values[j])
            j += 1
    while i < len(sv1.indices):
        indices.append(sv1.indices[i])
        values.append(sv1.values[i])
        i += 1
    while j < len(sv2.indices):
        indices.append(sv2.indices[j])
        values.append(sv2.values[j])
        j += 1
    return Vectors.sparse(len(sv1), indices, values)
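A quick usage sketch of the merge above, with invented inputs:
from pyspark.mllib.linalg import Vectors

a = Vectors.sparse(4, [0, 2], [1.0, 3.0])
b = Vectors.sparse(4, [2, 3], [2.0, 5.0])
# Index 2 appears in both inputs, so its values are summed.
print(add_svec(a, b))  # (4,[0,2,3],[1.0,5.0,5.0])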
Example 7: test_glr_summary
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_glr_summary(self):
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                      fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))  # basestring: this test targets Python 2
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values;
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example 8: scoreOnePoint
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def scoreOnePoint(self, x):
    """
    Compute the log likelihood of 'x' being generated under the current model,
    and the probability that 'x' is generated by each component of the mixture.

    Parameters
    ----------
    x : array of shape (1, n_dim)
        Corresponds to a single data point.

    Returns
    -------
    log_likelihood_x : Log likelihood of 'x'
    prob_x : Responsibility of each cluster for the data point 'x'
    """
    lpr = self.log_multivariate_normal_density_diag_Nd(x) + np.log(self.Weights)
    log_likelihood_x = logsumexp(lpr)
    prob_x = np.exp(lpr - log_likelihood_x)
    if self.isSparse == 1:
        temp_wt = np.dot(prob_x[:, np.newaxis], x.toArray()[np.newaxis, :])
        # Square the nonzero values while keeping the sparse layout
        sqVec = Vectors.sparse(x.size, x.indices, x.values ** 2)
        temp_avg = np.dot(prob_x.T[:, np.newaxis], sqVec.toArray()[np.newaxis, :])
    else:
        temp_wt = np.dot(prob_x.T[:, np.newaxis], x[np.newaxis, :])
        temp_avg = np.dot(prob_x.T[:, np.newaxis], (x * x)[np.newaxis, :])
    return log_likelihood_x, prob_x, temp_wt, temp_avg
Example 9: createSparseVector
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def createSparseVector(histogram):
    # getKey is defined elsewhere in the original project; it extracts the
    # histogram index so entries are sorted by index, as SparseVector requires.
    indexList = []
    countList = []
    for histogramIndex, count in sorted(histogram, key=getKey):
        indexList.append(histogramIndex)
        countList.append(count)
    return Vectors.sparse(2000, indexList, countList)
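To illustrate with an invented histogram (the 2000-bin size comes from the snippet; getKey is assumed to be a key function returning the bin index):
getKey = lambda pair: pair[0]  # assumed helper: sort entries by histogram index

histogram = [(42, 3.0), (7, 1.0), (1999, 5.0)]
print(createSparseVector(histogram))  # (2000,[7,42,1999],[1.0,3.0,5.0])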
Example 10: load_cut_to_rdd
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def load_cut_to_rdd(input_file, result_file):
    # parseKV is defined elsewhere in the original script; it maps a raw line
    # to ((tid, term), tf). Also assumes: import math; from operator import add.
    # The tuple-parameter lambdas of the original (Python 2 only syntax) are
    # rewritten below with explicit indexing so the code also runs on Python 3.
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()
    doc_term_tf = data.reduceByKey(add).cache()
    num_doc = doc_term_tf.map(lambda kv: kv[0][0]).distinct().count()
    terms_list = doc_term_tf.map(lambda kv: kv[0][1]).distinct().collect()
    num_term = len(terms_list)
    term_idf = doc_term_tf.map(
        lambda kv: (kv[0][1], 1.0)
    ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf + 1)))
    tfidf_join = doc_term_tf.map(
        lambda kv: (kv[0][1], (kv[0][0], kv[1]))).join(term_idf)
    tfidf = tfidf_join.map(
        lambda kv: (kv[1][0][0], (terms_list.index(kv[0]), kv[1][0][1] * kv[1][1])))
    doc_vec = tfidf.groupByKey().mapValues(
        lambda feature: Vectors.sparse(num_term, feature).toArray()).cache()
    with open(result_file, 'w') as f:
        f.write('%s %s\r\n' % (num_doc, num_term))
        for (tid, feature) in doc_vec.collect():
            for num in feature:
                f.write(str(num) + "\t")
            f.write("\n")
    sc.stop()
    return
Example 11: test_logistic_regression_summary
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values;
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example 12: loadLibSVMFile
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
    """
    Loads labeled data in the LIBSVM format into an RDD of
    LabeledPoint. The LIBSVM format is a text-based format used by
    LIBSVM and LIBLINEAR. Each line represents a labeled sparse
    feature vector using the following format:

    label index1:value1 index2:value2 ...

    where the indices are one-based and in ascending order. This
    method parses each line into a LabeledPoint, where the feature
    indices are converted to zero-based.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file
                 system URI
    :param numFeatures: number of features, which will be determined
                        from the input data if a nonpositive value
                        is given. This is useful when the dataset is
                        already split into multiple files and you
                        want to load them separately, because some
                        features may not be present in certain files,
                        which leads to inconsistent feature
                        dimensions.
    :param minPartitions: min number of partitions
    :return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
    >>> tempFile.flush()
    >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
    >>> tempFile.close()
    >>> type(examples[0]) == LabeledPoint
    True
    >>> print(examples[0])
    (1.0,(6,[0,2,4],[1.0,2.0,3.0]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print(examples[1])
    (-1.0,(6,[],[]))
    >>> type(examples[2]) == LabeledPoint
    True
    >>> print(examples[2])
    (-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
    """
    from pyspark.mllib.regression import LabeledPoint
    if multiclass is not None:
        warnings.warn("deprecated", DeprecationWarning)
    lines = sc.textFile(path, minPartitions)
    parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
    if numFeatures <= 0:
        parsed.cache()
        numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
    return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
Example 13: _get_data
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def _get_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        (
            "I dont know why people think this is such a bad movie.",
            # The declared size must exceed the largest index; the original
            # snippet passed 3 here, which fails for index 3.
            Vectors.sparse(4, {1: 1.0, 2: 1.0, 3: 1.0})
        ),
    ]
    return sql_context.createDataFrame(l, ['text', 'features'])
Example 14: test_model_transform
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def test_model_transform(self):
    weight = Vectors.dense([3, 2, 1])
    densevec = Vectors.dense([4, 5, 6])
    sparsevec = Vectors.sparse(3, [0], [1])
    eprod = ElementwiseProduct(weight)
    self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
    self.assertEqual(
        eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example 15: mkFeatureVector
# Required import: from pyspark.mllib.linalg import Vectors [as alias]
# Or: from pyspark.mllib.linalg.Vectors import sparse [as alias]
def mkFeatureVector(idxSizeArr):
    # Each (index, size) pair describes one categorical field; the fields'
    # one-hot blocks are concatenated into a single sparse vector.
    tempSize = 0
    featureArr = []
    valueArr = []
    for i in idxSizeArr:
        featureArr.append(i[0] + tempSize)
        valueArr.append(1)
        tempSize += i[1]
    return Vectors.sparse(tempSize, featureArr, valueArr)
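To illustrate with invented input: two categorical fields of sizes 3 and 4, with active categories 2 and 1 respectively.
# Field 1 contributes index 2 (offset 0); field 2 contributes index 1 + 3 = 4.
print(mkFeatureVector([(2, 3), (1, 4)]))  # (7,[2,4],[1.0,1.0])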