This article collects typical usage examples of the Java class org.apache.spark.ml.PipelineModel. If you have been asking yourself what PipelineModel is for, or how to use it, the curated examples below should help.
The PipelineModel class belongs to the org.apache.spark.ml package. Fifteen code examples are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
Example 1: registerFeatures
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Override
public void registerFeatures(SparkMLEncoder encoder){
    RFormulaModel transformer = getTransformer();
    ResolvedRFormula resolvedFormula = transformer.resolvedFormula();
    String targetCol = resolvedFormula.label();
    String labelCol = transformer.getLabelCol();
    if(!targetCol.equals(labelCol)){
        List<Feature> features = encoder.getFeatures(targetCol);
        encoder.putFeatures(labelCol, features);
    }
    PipelineModel pipelineModel = transformer.pipelineModel();
    Transformer[] stages = pipelineModel.stages();
    for(Transformer stage : stages){
        FeatureConverter<?> featureConverter = ConverterUtil.createFeatureConverter(stage);
        featureConverter.registerFeatures(encoder);
    }
}
Example 2: toPMMLByteArray
import org.apache.spark.ml.PipelineModel; // import the required package/class
static
public byte[] toPMMLByteArray(StructType schema, PipelineModel pipelineModel){
    PMML pmml = toPMML(schema, pipelineModel);
    ByteArrayOutputStream os = new ByteArrayOutputStream(1024 * 1024);
    try {
        MetroJAXBUtil.marshalPMML(pmml, os);
    } catch(JAXBException je){
        throw new RuntimeException(je);
    }
    return os.toByteArray();
}
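A minimal usage sketch for this helper; it assumes the method lives in a JPMML-SparkML-style ConverterUtil class (as in Example 3) and that a Pipeline and an input DataFrame are already at hand:

import java.nio.file.Files;
import java.nio.file.Paths;

// Fit the pipeline, convert it to an in-memory PMML document, and persist it.
PipelineModel pipelineModel = pipeline.fit(dataFrame);
byte[] pmmlBytes = ConverterUtil.toPMMLByteArray(dataFrame.schema(), pipelineModel);
Files.write(Paths.get("model.pmml"), pmmlBytes);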
Example 3: run
import org.apache.spark.ml.PipelineModel; // import the required package/class
private void run() throws Exception {
    SparkConf sparkConf = new SparkConf();
    try(JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)){
        SQLContext sqlContext = new SQLContext(sparkContext);
        DataFrameReader reader = sqlContext.read()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", "true");
        DataFrame dataFrame = reader.load(this.csvInput.getAbsolutePath());
        StructType schema = dataFrame.schema();
        System.out.println(schema.treeString());
        Pipeline pipeline = createPipeline(this.function, this.formula);
        PipelineModel pipelineModel = pipeline.fit(dataFrame);
        PMML pmml = ConverterUtil.toPMML(schema, pipelineModel);
        try(OutputStream os = new FileOutputStream(this.pmmlOutput.getAbsolutePath())){
            MetroJAXBUtil.marshalPMML(pmml, os);
        }
    }
}
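The com.databricks.spark.csv data source used above is not bundled with Spark 1.x; it has to be supplied as an external dependency. A Maven coordinate sketch (the Scala suffix and version number are assumptions and should be matched to the cluster):

<dependency>
    <groupId>com.databricks</groupId>
    <artifactId>spark-csv_2.10</artifactId>
    <version>1.5.0</version>
</dependency>

Alternatively, it can be passed at submit time via spark-submit --packages com.databricks:spark-csv_2.10:1.5.0.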
Example 4: CMMModel
import org.apache.spark.ml.PipelineModel; // import the required package/class
/**
 * Creates a conditional Markov model.
 * @param pipelineModel a fitted preprocessing pipeline
 * @param weights the weight vector of the trained model
 * @param markovOrder the Markov order of the model
 * @param tagDictionary a map from each word to the set of its admissible tag indices
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder, Map<String, Set<Integer>> tagDictionary) {
    this.pipelineModel = pipelineModel;
    this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
    this.weights = weights;
    // The pipeline is assumed to have a StringIndexerModel as stage 2 and a
    // CountVectorizerModel as stage 1 (see the sketch below).
    this.tags = ((StringIndexerModel)(pipelineModel.stages()[2])).labels();
    String[] features = ((CountVectorizerModel)(pipelineModel.stages()[1])).vocabulary();
    featureMap = new HashMap<String, Integer>();
    for (int j = 0; j < features.length; j++) {
        featureMap.put(features[j], j);
    }
    this.tagDictionary = tagDictionary;
}
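The casts on stages()[1] and stages()[2] above only succeed when the preprocessing pipeline was assembled with a fixed stage order. A sketch of a compatible pipeline; the column names are assumptions, only the stage positions matter:

// Stage positions must match the constructor's expectations:
// stage 1 -> CountVectorizerModel, stage 2 -> StringIndexerModel (after fitting).
Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{
    new RegexTokenizer().setInputCol("text").setOutputCol("tokens"),      // stage 0
    new CountVectorizer().setInputCol("tokens").setOutputCol("features"), // stage 1
    new StringIndexer().setInputCol("tag").setOutputCol("label")          // stage 2
});
PipelineModel pipelineModel = pipeline.fit(trainingData);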
Example 5: load
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Override
public CMMModel load(String path) {
    org.apache.spark.ml.util.DefaultParamsReader.Metadata metadata = DefaultParamsReader.loadMetadata(path, sc(), CMMModel.class.getName());
    String pipelinePath = new Path(path, "pipelineModel").toString();
    PipelineModel pipelineModel = PipelineModel.load(pipelinePath);
    String dataPath = new Path(path, "data").toString();
    DataFrame df = sqlContext().read().format("parquet").load(dataPath);
    Row row = df.select("markovOrder", "weights", "tagDictionary").head();
    // load the Markov order
    MarkovOrder order = MarkovOrder.values()[row.getInt(0) - 1];
    // load the weight vector
    Vector w = row.getAs(1);
    // load the tag dictionary
    @SuppressWarnings("unchecked")
    scala.collection.immutable.HashMap<String, WrappedArray<Integer>> td = (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>) row.get(2);
    Map<String, Set<Integer>> tagDict = new HashMap<String, Set<Integer>>();
    Iterator<Tuple2<String, WrappedArray<Integer>>> iterator = td.iterator();
    while (iterator.hasNext()) {
        Tuple2<String, WrappedArray<Integer>> tuple = iterator.next();
        Set<Integer> labels = new HashSet<Integer>();
        scala.collection.immutable.List<Integer> list = tuple._2().toList();
        for (int i = 0; i < list.size(); i++) {
            labels.add(list.apply(i));
        }
        tagDict.put(tuple._1(), labels);
    }
    // build a CMM model
    CMMModel model = new CMMModel(pipelineModel, w, order, tagDict);
    DefaultParamsReader.getAndSetParams(model, metadata);
    return model;
}
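Piecing together the three paths read above, a saved CMMModel directory has the following layout (the directory names come from the code; the content descriptions are inferred):

<path>/
├── metadata/        JSON params and metadata, read by DefaultParamsReader.loadMetadata
├── pipelineModel/   a standard Spark ML PipelineModel directory
└── data/            Parquet file with the columns markovOrder, weights, tagDictionary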
Example 6: TransitionBasedParserMLP
import org.apache.spark.ml.PipelineModel; // import the required package/class
/**
 * Creates a transition-based parser using an MLP transition classifier.
 * @param jsc a Java Spark context
 * @param classifierFileName the directory of the saved transition classifier
 * @param featureFrame the feature frame used for feature extraction
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
    this.featureFrame = featureFrame;
    this.classifier = TransitionClassifier.load(jsc, new Path(classifierFileName, "data").toString());
    this.pipelineModel = PipelineModel.load(new Path(classifierFileName, "pipelineModel").toString());
    // The same fixed stage layout as in Example 4: stage 2 is a StringIndexerModel,
    // stage 1 is a CountVectorizerModel.
    this.transitionName = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
    String[] features = ((CountVectorizerModel)(pipelineModel.stages()[1])).vocabulary();
    this.featureMap = new HashMap<String, Integer>();
    for (int j = 0; j < features.length; j++) {
        this.featureMap.put(features[j], j);
    }
}
Example 7: getModelInfo
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    final ModelInfo[] stages = new ModelInfo[from.stages().length];
    for (int i = 0; i < from.stages().length; i++) {
        Transformer sparkModel = from.stages()[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}
Example 8: getModelInfo
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from, final DataFrame df) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    final ModelInfo[] stages = new ModelInfo[from.stages().length];
    for (int i = 0; i < from.stages().length; i++) {
        Transformer sparkModel = from.stages()[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel, df);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}
Example 9: build
import org.apache.spark.ml.PipelineModel; // import the required package/class
public Transformer build(){
    Evaluator evaluator = getEvaluator();
    PMMLTransformer pmmlTransformer = new PMMLTransformer(evaluator, this.columnProducers);
    if(this.exploded){
        ColumnExploder columnExploder = new ColumnExploder(pmmlTransformer.getOutputCol());
        ColumnPruner columnPruner = new ColumnPruner(ScalaUtil.singletonSet(pmmlTransformer.getOutputCol()));
        PipelineModel pipelineModel = new PipelineModel(null, new Transformer[]{pmmlTransformer, columnExploder, columnPruner});
        return pipelineModel;
    }
    return pmmlTransformer;
}
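In exploded mode the returned PipelineModel chains three transformers: PMMLTransformer emits a single struct-typed result column, ColumnExploder flattens its fields into top-level columns, and ColumnPruner then drops the struct column itself. A hypothetical usage sketch, modeled on the JPMML-Evaluator-Spark TransformerBuilder (the builder and method names are assumptions):

// Build an exploded transformer around a PMML Evaluator and apply it.
Transformer pmmlTransformer = new TransformerBuilder(evaluator)
    .withTargetCols()   // expose the PMML target fields as result columns
    .exploded(true)     // flatten the struct output column, then prune it
    .build();
Dataset<Row> results = pmmlTransformer.transform(inputData);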
Example 10: getSource
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Override
public Class<PipelineModel> getSource() {
    return PipelineModel.class;
}
Example 11: testDecisionTreeRegressionPrediction
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Test
public void testDecisionTreeRegressionPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/regression_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);
    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];
    StringIndexer indexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("labelIndex")
        .setHandleInvalid("skip");
    DecisionTreeRegressor regressionModel =
        new DecisionTreeRegressor().setLabelCol("labelIndex").setFeaturesCol("features");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{indexer, regressionModel});
    PipelineModel sparkPipeline = pipeline.fit(trainingData);
    byte[] exportedModel = ModelExporter.export(sparkPipeline);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    List<Row> output = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList();
    // compare predictions
    for (Row row : output) {
        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(2)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
    }
}
Author: flipkart-incubator | Project: spark-transformers | Source: DecisionTreeRegressionModelBridgePipelineTest.java
Example 12: testGradientBoostClassification
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Test
public void testGradientBoostClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);
    StringIndexer indexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("labelIndex");
    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];
    // Train a gradient-boosted tree model.
    GBTClassifier classificationModel = new GBTClassifier()
        .setLabelCol("labelIndex")
        .setFeaturesCol("features");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{indexer, classificationModel});
    PipelineModel sparkPipeline = pipeline.fit(trainingData);
    // Export this model.
    byte[] exportedModel = ModelExporter.export(sparkPipeline);
    // Import and get a Transformer.
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    List<Row> sparkOutput = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList();
    // compare predictions
    for (Row row : sparkOutput) {
        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(2)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction") + " ," + row.get(1));
        assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
    }
}
Author: flipkart-incubator | Project: spark-transformers | Source: GradientBoostClassificationModelPipelineTest.java
Example 13: testDecisionTreeClassificationWithPipeline
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);
    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];
    StringIndexer indexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("labelIndex");
    // Train a DecisionTree model.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
        .setLabelCol("labelIndex")
        .setFeaturesCol("features");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{indexer, classificationModel});
    // Train the model. This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);
    // Export this model.
    byte[] exportedModel = ModelExporter.export(sparkPipeline);
    // Import and get a Transformer.
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    List<Row> output = sparkPipeline.transform(testData).select("features", "label", "prediction", "rawPrediction").collectAsList();
    // compare predictions
    for (Row row : output) {
        Map<String, Object> data_ = new HashMap<>();
        double[] actualRawPrediction = ((DenseVector) row.get(3)).toArray();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(1)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double) data_.get("prediction"), (double) row.get(2), EPSILON);
        assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON);
    }
}
Author: flipkart-incubator | Project: spark-transformers | Source: DecisionTreeClassificationModelBridgePipelineTest.java
Example 14: testPipeline
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
        createStructField("label", DoubleType, false)
    });
    // cr(...) is a row-creation helper (e.g. a wrapper around RowFactory.create).
    Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList(
        cr(0L, "a b c d e spark", 1.0),
        cr(1L, "b d", 0.0),
        cr(2L, "spark f g h", 1.0),
        cr(3L, "hadoop mapreduce", 0.0)
    ), schema);
    // Configure an ML pipeline that consists of three stages: tokenizer, hashingTF, and logistic regression.
    RegexTokenizer tokenizer = new RegexTokenizer()
        .setInputCol("text")
        .setOutputCol("words")
        .setPattern("\\s")
        .setGaps(true)
        .setToLowercase(false);
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});
    // Fit the pipeline to the training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);
    // Export this model.
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(exportedModel));
    // Import and get a Transformer.
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    // Prepare test data.
    StructType testSchema = createStructType(new StructField[]{
        createStructField("id", LongType, false),
        createStructField("text", StringType, false),
    });
    Dataset<Row> testData = spark.createDataFrame(Arrays.asList(
        cr(4L, "spark i j k"),
        cr(5L, "l m n"),
        cr(6L, "mapreduce spark"),
        cr(7L, "apache hadoop")
    ), testSchema);
    // Verify that the predictions of the Spark pipeline and the exported pipeline agree.
    List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        // The probability is read back as well, although only the prediction is asserted below.
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01);
    }
}
Example 15: testRandomForestRegressionWithPipeline
import org.apache.spark.ml.PipelineModel; // import the required package/class
@Test
public void testRandomForestRegressionWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");
    // Split the data into training and test sets (30% held out for testing).
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];
    // Train a RandomForest model.
    RandomForestRegressionModel regressionModel = new RandomForestRegressor()
        .setFeaturesCol("features").fit(trainingData);
    // Wrap the already-fitted model in a single-stage pipeline.
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{regressionModel});
    PipelineModel sparkPipeline = pipeline.fit(trainingData);
    // Export this model.
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);
    // Import and get a Transformer.
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect();
    // compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);
        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());
        assertEquals(actual, predicted, EPSILON);
    }
}
Author: flipkart-incubator | Project: spark-transformers | Source: RandomForestRegressionModelInfoAdapterBridgeTest.java