This article collects typical usage examples of the Java class org.apache.spark.sql.types.StructField. If you are wondering what the StructField class is for, how to use it, or what real-world usage looks like, the curated examples below may help.
The StructField class belongs to the org.apache.spark.sql.types package. A total of 15 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
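Before the examples, here is a minimal sketch of the two common ways a StructField is created from Java: directly via the constructor and via the DataTypes factory methods. The column names are placeholders chosen for illustration, not taken from any example below.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Constructor form: name, data type, nullable flag, metadata.
StructField idField = new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
// Factory form: equivalent, with empty metadata implied.
StructField nameField = DataTypes.createStructField("name", DataTypes.StringType, true);
// Fields are combined into a StructType, which describes a full row schema.
StructType schema = new StructType(new StructField[]{idField, nameField});

Both forms appear throughout the examples that follow.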
Example 1: createNGramDataFrame
import org.apache.spark.sql.types.StructField; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines an RDD of text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
            new StructField("words",
                    DataTypes.createArrayType(DataTypes.StringType), false,
                    Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
            .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
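The method above is an instance method that reads a JavaSparkContext from a field named jsc. A hedged usage sketch follows; the SparkConf settings, the input path, and the surrounding object are assumptions made here for illustration.

// Hypothetical caller: read raw text, then build bigrams with the method above.
JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("ngrams").setMaster("local[*]"));
JavaRDD<String> lines = jsc.textFile("/path/to/corpus.txt"); // placeholder path
DataFrame ngrams = createNGramDataFrame(lines);
ngrams.printSchema(); // words: array<string>, ngrams: array<string>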
Example 2: test_getDataSetResult
import org.apache.spark.sql.types.StructField; // import the required package/class
@Test
public void test_getDataSetResult() {
    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));
    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);
    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
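The test relies on a sparkSession field initialized elsewhere, typically in a @Before or @BeforeClass fixture. A minimal local setup, with the master URL and application name chosen here as placeholders, might be:

// Hypothetical test fixture for the test above.
SparkSession sparkSession = SparkSession.builder()
        .master("local[*]")
        .appName("spark-utils-test")
        .getOrCreate();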
Example 3: generateData_week_timepoints_by_10_minutes
import org.apache.spark.sql.types.StructField; // import the required package/class
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
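A hedged usage sketch for the generator above; the SparkSession setup is an assumption, and the method should yield one row per 10-minute slot of a week (7 * 24 * 60 / 10 = 1008 rows).

SparkSession spark = SparkSession.builder().master("local[*]").appName("timepoints").getOrCreate();
Dataset<Row> timepoints = generateData_week_timepoints_by_10_minutes(spark);
timepoints.show(3);                       // timepoint values 0, 1, 2, ...
System.out.println(timepoints.count());  // 1008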
Example 4: parse
import org.apache.spark.sql.types.StructField; // import the required package/class
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc a Java Spark context
 * @param sentences a list of PoS-tagged sentences
 * @param outputFileName the output file name
 * @param outputFormat the output format
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    JavaRDD<String> input = jsc.parallelize(sentences);
    JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
    JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
    JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
        private static final long serialVersionUID = -812004521983071103L;

        public Row call(DependencyGraph graph) {
            return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
        }
    });
    StructType schema = new StructType(new StructField[]{
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
            new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame df = sqlContext.createDataFrame(rows, schema);
    if (outputFormat == OutputFormat.TEXT)
        df.select("dependency").write().text(outputFileName);
    else
        df.repartition(1).write().json(outputFileName);
}
Example 5: fromSchema
import org.apache.spark.sql.types.StructField; // import the required package/class
/**
 * Convert a DataVec schema to a Spark StructType.
 *
 * @param schema the schema to convert
 * @return the corresponding Spark StructType
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] = new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                        "This api should not be used with strings, binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
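A hedged usage sketch, assuming the DataVec Schema.Builder API; the column names are invented for illustration.

// Hypothetical DataVec schema containing only column types the converter supports.
Schema dataVecSchema = new Schema.Builder()
        .addColumnDouble("price")
        .addColumnInteger("quantity")
        .addColumnLong("timestamp")
        .build();
StructType sparkSchema = fromSchema(dataVecSchema);
sparkSchema.printTreeString(); // roughly: price double, quantity int, timestamp bigint, all non-nullable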
Example 6: init
import org.apache.spark.sql.types.StructField; // import the required package/class
@Override
public StructField init(Evaluator evaluator) {
    OutputField field = getField();
    DataType dataType = field.getDataType();
    if (dataType == null) {
        try {
            dataType = OutputUtil.getDataType(field.getOutputField(), (ModelEvaluator<?>) evaluator);
            this.formatString = false;
        } catch (PMMLException pe) {
            dataType = DataType.STRING;
            this.formatString = true;
        }
    }
    return DataTypes.createStructField(getColumnName(), SchemaUtil.translateDataType(dataType), false);
}
Example 7: generateData_numbers_1k
import org.apache.spark.sql.types.StructField; // import the required package/class
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);
    List<Row> rows = new ArrayList<>();
    // the loop bound is inclusive, so values 0..1000 (1001 rows) are produced
    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
Example 8: writeEntityMetadata
import org.apache.spark.sql.types.StructField; // import the required package/class
/**
 * Write metadata describing entity tables.
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);
    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));
    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    // primary key on the entity name column
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));
    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());
    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 9: getDataFrame
import org.apache.spark.sql.types.StructField; // import the required package/class
private DataFrame getDataFrame() {
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("a", StringType, false),
            createStructField("b", DoubleType, false),
            createStructField("c", DoubleType, false),
            createStructField("d", BooleanType, false),
    });
    List<Row> trainingData = Arrays.asList(
            cr(1, null, null, null, null),
            cr(2, "test", 1.2, null, null),
            cr(3, null, 1.1, null, false),
            cr(4, "faffa", NaN, 45.0, true)
    );
    DataFrame df = sqlContext.createDataFrame(trainingData, schema);
    return df;
}
Developer ID: flipkart-incubator, Project: spark-transformers, Lines of code: 21, Source: FillNAValuesTransformerBridgeTest.java
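The cr(...) helper is not included in this snippet. Judging by how it is called, it is presumably a small shorthand for building rows; a guess at its shape (not the project's actual code) would be:

// Hypothetical shorthand used in the test above.
private static Row cr(Object... values) {
    return RowFactory.create(values);
}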
Example 10: schemaFor
import org.apache.spark.sql.types.StructField; // import the required package/class
private static Schema schemaFor(StructType structType, String record, String namespace, int recordCount) {
    // Increment after using the recordCount
    record = (null == record) ? "record" + recordCount++ : record;
    LOG.debug("Converting {} to Avro Record schema [{}:{}]", structType, record, namespace);
    SchemaBuilder.RecordBuilder<Schema> schema = SchemaBuilder.record(record);
    if (null != namespace) {
        schema.namespace(namespace);
    }
    schema.doc("Auto-generated from Spark DataFrame");
    SchemaBuilder.FieldAssembler<Schema> assembler = schema.fields();
    StructField[] structFields = structType.fields();
    for (StructField f : structFields) {
        assembler.name(f.name()).type(typeFor(f.dataType(), f.nullable(), recordCount)).noDefault();
    }
    return assembler.endRecord();
}
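schemaFor relies on a typeFor helper (not shown here) that maps a Spark DataType and nullability to an Avro type. A hedged invocation from within the same class, with a placeholder record name and namespace, might look like:

StructType structType = new StructType(new StructField[]{
        new StructField("id", DataTypes.LongType, false, Metadata.empty()),
        new StructField("label", DataTypes.StringType, true, Metadata.empty())
});
Schema avroSchema = schemaFor(structType, "MyRecord", "com.example", 0);
System.out.println(avroSchema.toString(true)); // pretty-printed Avro schema JSON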
Example 11: createDataField
import org.apache.spark.sql.types.StructField; // import the required package/class
public DataField createDataField(FieldName name) {
    StructField field = this.schema.apply(name.getValue());
    org.apache.spark.sql.types.DataType sparkDataType = field.dataType();
    if (sparkDataType instanceof StringType) {
        return createDataField(name, OpType.CATEGORICAL, DataType.STRING);
    } else if (sparkDataType instanceof IntegralType) {
        return createDataField(name, OpType.CONTINUOUS, DataType.INTEGER);
    } else if (sparkDataType instanceof DoubleType) {
        return createDataField(name, OpType.CONTINUOUS, DataType.DOUBLE);
    } else if (sparkDataType instanceof BooleanType) {
        return createDataField(name, OpType.CATEGORICAL, DataType.BOOLEAN);
    } else {
        throw new IllegalArgumentException("Expected string, integral, double or boolean type, got " + sparkDataType.typeName() + " type");
    }
}
Example 12: fromSchemaSequence
import org.apache.spark.sql.types.StructField; // import the required package/class
/**
 * Convert the DataVec sequence schema to a StructType for Spark, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}.
 * <b>Note</b>: as per {@link #toDataFrameSequence(Schema, JavaRDD)}, the StructType has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}.
 *
 * @param schema Schema to convert
 * @return StructType for the schema
 */
public static StructType fromSchemaSequence(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns() + 2];
    structFields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    structFields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());
    for (int i = 0; i < schema.numColumns(); i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i + 2] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i + 2] = new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i + 2] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i + 2] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                        "This api should not be used with strings, binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
Example 13: sparkSchemaToIndexRSchema
import org.apache.spark.sql.types.StructField; // import the required package/class
public static SegmentSchema sparkSchemaToIndexRSchema(List<StructField> sparkSchema, IsIndexed isIndexed) {
    List<ColumnSchema> columns = new ArrayList<>();
    for (StructField f : sparkSchema) {
        SQLType type;
        if (f.dataType() instanceof IntegerType) {
            type = SQLType.INT;
        } else if (f.dataType() instanceof LongType) {
            type = SQLType.BIGINT;
        } else if (f.dataType() instanceof FloatType) {
            type = SQLType.FLOAT;
        } else if (f.dataType() instanceof DoubleType) {
            type = SQLType.DOUBLE;
        } else if (f.dataType() instanceof StringType) {
            type = SQLType.VARCHAR;
        } else if (f.dataType() instanceof DateType) {
            type = SQLType.DATE;
        } else if (f.dataType() instanceof TimestampType) {
            type = SQLType.DATETIME;
        } else {
            throw new IllegalStateException("Unsupported type: " + f.dataType());
        }
        columns.add(new ColumnSchema(f.name(), type, isIndexed.apply(f.name())));
    }
    return new SegmentSchema(columns);
}
Example 14: inferSchema
import org.apache.spark.sql.types.StructField; // import the required package/class
public static List<StructField> inferSchema(List<FileStatus> files, Configuration configuration) {
    try {
        for (FileStatus fileStatus : files) {
            Path path = fileStatus.getPath();
            if (!SegmentHelper.checkSegmentByPath(path)
                    || fileStatus.getLen() == 0) {
                continue;
            }
            FileSystem fileSystem = path.getFileSystem(configuration);
            ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, path);
            IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(path.toString(), opener);
            if (fd != null) {
                return indexrSchemaToSparkSchema(fd.info().schema());
            }
        }
        return null;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example 15: testAgeRangeFloat
import org.apache.spark.sql.types.StructField; // import the required package/class
@Test
public void testAgeRangeFloat() {
    StructType schema = new StructType(new StructField[] {
            new StructField("name", DataTypes.StringType, false, Metadata.empty()),
            new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
            new StructField("age", DataTypes.FloatType, false, Metadata.empty()),
            new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
    });
    Map<String, Object> configMap = new HashMap<>();
    configMap.put("fields", Lists.newArrayList("age"));
    configMap.put("fieldtype", "float");
    configMap.put("range", Lists.newArrayList(0.1, 105.0));
    Config config = ConfigFactory.parseMap(configMap);
    RangeRowRule rule = new RangeRowRule();
    rule.configure("agerange", config);
    Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34.0f, new BigDecimal("0.00"));
    assertTrue("Row should pass rule", rule.check(row1));
    Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110.0f, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(row2));
    Row row3 = new RowWithSchema(schema, "", "Ian1", 110.0f, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(row3));
    Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 100.0f, new BigDecimal("450.10"));
    assertTrue("Row should pass rule", rule.check(row4));
}