本文整理汇总了Java中org.apache.spark.sql.types.Metadata类的典型用法代码示例。如果您正苦于以下问题:Java Metadata类的具体用法?Java Metadata怎么用?Java Metadata使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
Metadata类属于org.apache.spark.sql.types包,在下文中一共展示了Metadata类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createNGramDataFrame
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Builds a bigram data frame from lines of text.
 * <p>
 * Every line is split on runs of whitespace and stored as one value of a
 * non-nullable "words" string-array column. An {@code NGram} transformer with
 * n = 2 then derives the "ngrams" column. Up to ten rows of the result are
 * printed (with truncation disabled) before it is returned.
 *
 * @param lines the text lines to tokenize
 * @return the data frame produced by the n-gram transformer
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    // One column only: the token array of a line.
    StructField wordsField = new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { wordsField });

    JavaRDD<Row> tokenRows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            String[] tokens = line.split("\\s+");
            // The token list is passed as ONE column value, not spread over columns.
            return RowFactory.create(Arrays.asList(tokens));
        }
    });

    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame tokenized = sqlContext.createDataFrame(tokenRows, schema);

    // build a bigram language model
    NGram bigram = new NGram();
    bigram.setInputCol("words");
    bigram.setOutputCol("ngrams");
    bigram.setN(2);

    DataFrame bigrams = bigram.transform(tokenized);
    bigrams.show(10, false);
    return bigrams;
}
示例2: test_getDataSetResult
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Checks that {@code SparkUtils.getDataSetResult} reports both columns and
 * both rows of a small two-column data set, with cell values intact.
 */
@Test
public void test_getDataSetResult() {
    // Schema: a nullable integer column followed by a nullable string column.
    StructField[] structFields = new StructField[]{
        new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
        new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };
    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));
    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);

    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    // Integer.valueOf replaces the deprecated Integer(int) constructor; the
    // expected value is still a boxed Integer so the same overload is chosen.
    Assert.assertEquals(Integer.valueOf(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(Integer.valueOf(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
示例3: generateData_week_timepoints_by_10_minutes
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Generates a one-column data set holding one integer "timepoint" for every
 * 10-minute slot of a week, i.e. the values 0 through 1007.
 *
 * @param spark the session used to create the data set
 * @return a data set with a single nullable integer column named "timepoint"
 */
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructType schema = new StructType(new StructField[] {
        new StructField("timepoint", DataTypes.IntegerType, true, Metadata.empty())
    });

    final int minutesPerWeek = 7 * 24 * 60;
    final int minutesPerSlot = 10;
    final int slotCount = minutesPerWeek / minutesPerSlot;

    List<Row> slots = new ArrayList<>();
    for (int slot = 0; slot < slotCount; slot++) {
        // Single-column row: the slot index itself.
        slots.add(RowFactory.create(slot));
    }
    return spark.createDataFrame(slots, schema);
}
示例4: parse
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Parses PoS-tagged sentences (one per list element) and writes the result to
 * an output location.
 * <p>
 * Each sentence is turned into a dependency graph; the graph's sentence text and
 * its dependencies become the "sentence" and "dependency" columns of a data frame.
 * For {@code OutputFormat.TEXT} only the "dependency" column is written as text;
 * for any other format the whole frame is written as JSON from a single partition.
 *
 * @param jsc the Spark context used to distribute the sentences
 * @param sentences the tagged sentences to parse
 * @param outputFileName the path handed to the data frame writer
 * @param outputFormat the output format selecting text or JSON output
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    JavaRDD<DependencyGraph> parsed = jsc.parallelize(sentences)
            .map(new TaggedLineToSentenceFunction())
            .map(new ParsingFunction());

    JavaRDD<Row> resultRows = parsed.map(new Function<DependencyGraph, Row>() {
        private static final long serialVersionUID = -812004521983071103L;

        @Override
        public Row call(DependencyGraph graph) {
            String sentenceText = graph.getSentence().toString();
            return RowFactory.create(sentenceText, graph.dependencies());
        }
    });

    StructField sentenceField =
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty());
    StructField dependencyField =
            new StructField("dependency", DataTypes.StringType, false, Metadata.empty());
    StructType schema = new StructType(new StructField[]{ sentenceField, dependencyField });

    DataFrame result = new SQLContext(jsc).createDataFrame(resultRows, schema);

    if (outputFormat != OutputFormat.TEXT) {
        // Collapse to one partition before writing JSON.
        result.repartition(1).write().json(outputFileName);
    } else {
        result.select("dependency").write().text(outputFileName);
    }
}
示例5: fromSchema
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Converts a DataVec schema into the equivalent Spark struct type.
 * <p>
 * Only Double, Integer, Long and Float columns are supported; each becomes a
 * non-nullable Spark field with the same name and empty metadata.
 *
 * @param schema the schema to convert
 * @return the Spark struct type with one field per schema column
 * @throws IllegalStateException if a column has any other type
 */
public static StructType fromSchema(Schema schema) {
    final int columnCount = schema.numColumns();
    StructField[] fields = new StructField[columnCount];

    for (int col = 0; col < columnCount; col++) {
        // Pick the Spark type first; the field itself is built once below.
        org.apache.spark.sql.types.DataType sparkType;
        switch (schema.getColumnTypes().get(col)) {
            case Double:
                sparkType = DataTypes.DoubleType;
                break;
            case Integer:
                sparkType = DataTypes.IntegerType;
                break;
            case Long:
                sparkType = DataTypes.LongType;
                break;
            case Float:
                sparkType = DataTypes.FloatType;
                break;
            default:
                throw new IllegalStateException(
                        "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
        fields[col] = new StructField(schema.getName(col), sparkType, false, Metadata.empty());
    }
    return new StructType(fields);
}
示例6: generateData_numbers_1k
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Generates a one-column data set of the integers 0 through 1000 inclusive
 * (1001 rows) in a nullable integer column named "number".
 *
 * @param spark the session used to create the data set
 * @return the generated data set
 */
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructType schema = new StructType(new StructField[] {
        new StructField("number", DataTypes.IntegerType, true, Metadata.empty())
    });

    final int lastNumber = 1000;
    List<Row> numbers = new ArrayList<>();
    for (int value = 0; value <= lastNumber; value++) {
        // Single-column row carrying the number itself.
        numbers.add(RowFactory.create(value));
    }
    return spark.createDataFrame(numbers, schema);
}
示例7: fromSchemaSequence
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Converts a DataVec sequence schema into a Spark struct type, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}.
 * <p>
 * <b>Note</b>: the returned struct type starts with two extra non-nullable columns:<br>
 * - Column 0: sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}), a string<br>
 * - Column 1: sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}), an integer<br>
 * The schema's own columns follow from position 2 onwards. Only Double, Integer,
 * Long and Float columns are supported.
 *
 * @param schema Schema to convert
 * @return StructType for the schema, prefixed by the two sequence columns
 * @throws IllegalStateException if a column has any other type
 */
public static StructType fromSchemaSequence(Schema schema) {
    final int prefixColumns = 2;
    StructField[] fields = new StructField[schema.numColumns() + prefixColumns];

    fields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    fields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());

    for (int col = 0; col < schema.numColumns(); col++) {
        // Pick the Spark type first; the field itself is built once below.
        org.apache.spark.sql.types.DataType sparkType;
        switch (schema.getColumnTypes().get(col)) {
            case Double:
                sparkType = DataTypes.DoubleType;
                break;
            case Integer:
                sparkType = DataTypes.IntegerType;
                break;
            case Long:
                sparkType = DataTypes.LongType;
                break;
            case Float:
                sparkType = DataTypes.FloatType;
                break;
            default:
                throw new IllegalStateException(
                        "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
        // Shift past the UUID and index columns.
        fields[col + prefixColumns] =
                new StructField(schema.getName(col), sparkType, false, Metadata.empty());
    }
    return new StructType(fields);
}
示例8: testAgeRangeFloat
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Configures a {@code RangeRowRule} on the float "age" field with the range
 * [0.1, 105.0] and checks rows whose age is 34, 110, 110 and 100: the two
 * ages of 110 are expected to fail, the other two to pass.
 */
@Test
public void testAgeRangeFloat() {
    StructField nameField = new StructField("name", DataTypes.StringType, false, Metadata.empty());
    StructField nicknameField = new StructField("nickname", DataTypes.StringType, false, Metadata.empty());
    StructField ageField = new StructField("age", DataTypes.FloatType, false, Metadata.empty());
    StructField scoreField =
            new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty());
    StructType schema = new StructType(new StructField[] {
        nameField, nicknameField, ageField, scoreField
    });

    Map<String, Object> ruleSettings = new HashMap<>();
    ruleSettings.put("fields", Lists.newArrayList("age"));
    ruleSettings.put("fieldtype", "float");
    ruleSettings.put("range", Lists.newArrayList(0.1,105.0));

    RangeRowRule rule = new RangeRowRule();
    rule.configure("agerange", ConfigFactory.parseMap(ruleSettings));

    // Age 34: expected to pass.
    Row inRange = new RowWithSchema(schema, "Ian", "Ian", 34.0f, new BigDecimal("0.00"));
    assertTrue("Row should pass rule", rule.check(inRange));

    // Age 110: expected to fail.
    Row tooOld = new RowWithSchema(schema, "Webster1", "Websta1", 110.0f, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(tooOld));

    // Age 110 again, this time with an empty name.
    Row tooOldNoName = new RowWithSchema(schema, "", "Ian1", 110.0f, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(tooOldNoName));

    // Age 100: expected to pass.
    Row nearUpperBound = new RowWithSchema(schema, "First Last", "Ian Last", 100.0f, new BigDecimal("450.10"));
    assertTrue("Row should pass rule", rule.check(nearUpperBound));
}
示例9: testPruneByStepValueTrue
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Gives step1 a single-row data set whose "outcome" is {@code true}, adds a
 * step-by-value decision step ("step2") that depends on step1 and names
 * "step3" and "step7" as its if-true steps, and checks the set of steps
 * returned by {@code refactor}.
 */
@Test
public void testPruneByStepValueTrue() {
    StructType schema = new StructType(new StructField[] {
        new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
    });
    List<Row> rows = Lists.newArrayList(
        RowFactory.create(true)
    );
    Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
    step1.setData(ds);

    Map<String, Object> step2ConfigMap = Maps.newHashMap();
    step2ConfigMap.put("dependencies", Lists.newArrayList("step1"));
    step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
    step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
    step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
    Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
    RefactorStep step2 = new DecisionStep("step2", step2Config);
    steps.add(step2);

    Set<Step> refactored = step2.refactor(steps);

    // JUnit's contract is assertEquals(expected, actual); with the expected set
    // first, a failure message labels the two values correctly.
    assertEquals(Sets.newHashSet(step1, step2, step3, step4, step7, step8), refactored);
}
示例10: testPruneByStepValueFalse
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Gives step1 a single-row data set whose "outcome" is {@code false}, adds a
 * step-by-value decision step ("step2") that depends on step1 and names
 * "step3" and "step7" as its if-true steps, and checks the set of steps
 * returned by {@code refactor}.
 */
@Test
public void testPruneByStepValueFalse() {
    StructType schema = new StructType(new StructField[] {
        new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
    });
    List<Row> rows = Lists.newArrayList(
        RowFactory.create(false)
    );
    Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
    step1.setData(ds);

    Map<String, Object> step2ConfigMap = Maps.newHashMap();
    step2ConfigMap.put("dependencies", Lists.newArrayList("step1"));
    step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
    step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
    step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
    Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
    RefactorStep step2 = new DecisionStep("step2", step2Config);
    steps.add(step2);

    Set<Step> refactored = step2.refactor(steps);

    // JUnit's contract is assertEquals(expected, actual); with the expected set
    // first, a failure message labels the two values correctly.
    assertEquals(Sets.newHashSet(step1, step2, step5, step6), refactored);
}
示例11: getSchemaType
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Describes this node as a struct type with one nullable field per child,
 * each named after the child and carrying empty metadata.
 *
 * @return the struct type assembled from the children
 */
@Override
public DataType getSchemaType() {
    final int childCount = children.size();
    StructField[] childFields = new StructField[childCount];
    for (int idx = 0; idx < childCount; idx++) {
        String childName = children.get(idx).getName();
        // NOTE(review): the field's data type is passed as null here; confirm
        // whether the child's own schema type was meant to be used instead.
        childFields[idx] = new StructField(childName, null, true, Metadata.empty());
    }
    return new StructType(childFields);
}
示例12: createDataFrame
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Creates a data frame from a list of tagged sentences.
 * <p>
 * Each sentence is split on whitespace into tokens and each token on "/".
 * A token with exactly two parts contributes its first part to the word
 * sequence and its second part to the tag sequence. Any other token (for
 * example "///") contributes "/" to both. Words and tags are joined with
 * single spaces.
 *
 * @param taggedSentences the sentences whose tokens are written as word/tag
 * @return a data frame of two columns: "sentence" and "partOfSpeech".
 */
public DataFrame createDataFrame(List<String> taggedSentences) {
    // Rows are built in the same pass that splits the sentences. Pairing up two
    // LinkedLists afterwards with get(i) would cost a list walk per access.
    List<Row> rows = new LinkedList<Row>();
    for (String taggedSentence : taggedSentences) {
        StringBuilder wordBuf = new StringBuilder();
        StringBuilder tagBuf = new StringBuilder();
        for (String token : taggedSentence.split("\\s+")) {
            String[] parts = token.split("/");
            if (parts.length == 2) {
                wordBuf.append(parts[0]).append(' ');
                tagBuf.append(parts[1]).append(' ');
            } else { // this token is "///" (or otherwise not exactly word/tag)
                wordBuf.append('/').append(' ');
                tagBuf.append('/').append(' ');
            }
        }
        // trim() drops the trailing separator appended after the last token.
        rows.add(RowFactory.create(wordBuf.toString().trim(), tagBuf.toString().trim()));
    }
    if (verbose) {
        System.out.println("Number of sentences = " + rows.size());
    }
    JavaRDD<Row> jrdd = jsc.parallelize(rows);
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("partOfSpeech", DataTypes.StringType, false, Metadata.empty())
    });
    return new SQLContext(jsc).createDataFrame(jrdd, schema);
}
示例13: tag
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Tags a list of sentences with the current model.
 * <p>
 * The sentences are loaded into a one-column "sentence" data frame and
 * transformed by the model; the output rows are mapped to strings through
 * {@code RowToStringFunction(1)} and collected.
 *
 * @param sentences the sentences to tag
 * @return the collected strings, or {@code null} (after a message on standard
 *         error) when no model has been created or loaded
 */
public List<String> tag(List<String> sentences) {
    List<Row> sentenceRows = new LinkedList<Row>();
    for (String sentence : sentences) {
        sentenceRows.add(RowFactory.create(sentence));
    }
    StructField sentenceField =
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty());
    StructType schema = new StructType(new StructField[]{ sentenceField });
    DataFrame input = new SQLContext(jsc).createDataFrame(sentenceRows, schema);

    if (cmmModel == null) {
        System.err.println("Tagging model is null. You need to create or load a model first.");
        return null;
    }

    // Single partition before collecting the tagged output.
    DataFrame tagged = cmmModel.transform(input).repartition(1);
    return tagged.javaRDD().map(new RowToStringFunction(1)).collect();
}
示例14: transform
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Maps every row of the input through {@code DecodeFunction} and wraps the
 * result in a data frame with two non-nullable string columns, "sentence"
 * and "prediction", created on the input's own SQL context.
 *
 * @param dataset the data frame to decode
 * @return the decoded data frame
 */
@Override
public DataFrame transform(DataFrame dataset) {
    StructField sentenceField =
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty());
    StructField predictionField =
            new StructField("prediction", DataTypes.StringType, false, Metadata.empty());
    StructType outputSchema = new StructType(new StructField[]{ sentenceField, predictionField });

    JavaRDD<Row> decoded = dataset.javaRDD().map(new DecodeFunction());
    return dataset.sqlContext().createDataFrame(decoded, outputSchema);
}
示例15: load
import org.apache.spark.sql.types.Metadata; //导入依赖的package包/类
/**
 * Loads a {@code CMMModel} from the given path.
 * <p>
 * Reads the params metadata from {@code path}, the pipeline model from its
 * "pipelineModel" child and a parquet data frame from its "data" child. The
 * first row of that frame supplies the Markov order, the weight vector and
 * the tag dictionary, in that column order.
 *
 * @param path the location the model is read from
 * @return the reconstructed model with its params restored from the metadata
 */
@Override
public CMMModel load(String path) {
    org.apache.spark.ml.util.DefaultParamsReader.Metadata paramsMetadata =
            DefaultParamsReader.loadMetadata(path, sc(), CMMModel.class.getName());

    String pipelineLocation = new Path(path, "pipelineModel").toString();
    PipelineModel pipeline = PipelineModel.load(pipelineLocation);

    String dataLocation = new Path(path, "data").toString();
    DataFrame stored = sqlContext().read().format("parquet").load(dataLocation);
    Row firstRow = stored.select("markovOrder", "weights", "tagDictionary").head();

    // load the Markov order (the stored integer is one more than the index into values())
    MarkovOrder markovOrder = MarkovOrder.values()[firstRow.getInt(0) - 1];
    // load the weight vector
    Vector weights = firstRow.getAs(1);
    // load the tag dictionary
    @SuppressWarnings("unchecked")
    scala.collection.immutable.HashMap<String, WrappedArray<Integer>> storedDict =
            (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>) firstRow.get(2);

    // Copy the Scala map of arrays into a Java map of sets.
    Map<String, Set<Integer>> tagDictionary = new HashMap<String, Set<Integer>>();
    Iterator<Tuple2<String, WrappedArray<Integer>>> entries = storedDict.iterator();
    while (entries.hasNext()) {
        Tuple2<String, WrappedArray<Integer>> entry = entries.next();
        scala.collection.immutable.List<Integer> tagIds = entry._2().toList();
        Set<Integer> labels = new HashSet<Integer>();
        for (int k = 0; k < tagIds.size(); k++) {
            labels.add(tagIds.apply(k));
        }
        tagDictionary.put(entry._1(), labels);
    }

    // build a CMM model
    CMMModel loaded = new CMMModel(pipeline, weights, markovOrder, tagDictionary);
    DefaultParamsReader.getAndSetParams(loaded, paramsMetadata);
    return loaded;
}