当前位置: 首页>>代码示例>>Java>>正文


Java StructType类代码示例

本文整理汇总了Java中org.apache.spark.sql.types.StructType的典型用法代码示例。如果您正苦于以下问题:Java StructType类的具体用法?Java StructType怎么用?Java StructType使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


StructType类属于org.apache.spark.sql.types包,在下文中一共展示了StructType类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createNGramDataFrame

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
/**
 * Creates a n-gram data frame from text lines.
 * @param lines
 * @return a n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
开发者ID:phuonglh,项目名称:vn.vitk,代码行数:27,代码来源:NGramBuilder.java

示例2: test_getDataSetResult

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
@Test
public void test_getDataSetResult() {

    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));

    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
 
开发者ID:uber,项目名称:uberscriptquery,代码行数:25,代码来源:SparkUtilsTest.java

示例3: generateData_week_timepoints_by_10_minutes

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
开发者ID:uber,项目名称:uberscriptquery,代码行数:24,代码来源:QueryEngine.java

示例4: parse

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
/**
 * Parses a list of PoS-tagged sentences, each on a line and writes the result to an output 
 * file in a specified output format.
 * @param jsc
 * @param sentences
 * @param outputFileName
 * @param outuptFormat
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
		private static final long serialVersionUID = -812004521983071103L;
		public Row call(DependencyGraph graph) {
			return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
		}
	});
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),	
		new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame df = sqlContext.createDataFrame(rows, schema);
	
	if (outputFormat == OutputFormat.TEXT)  
		df.select("dependency").write().text(outputFileName);
	else 
		df.repartition(1).write().json(outputFileName);
}
 
开发者ID:phuonglh,项目名称:vn.vitk,代码行数:31,代码来源:DependencyParser.java

示例5: datasetSchema

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
static public StructType datasetSchema(Map<String, String> mappa) {
	StructType struct = new StructType();
	for (Map.Entry<String, String> entry : mappa.entrySet()) {
		switch (entry.getValue().toLowerCase()) {
		case "string":
		case "dictionary":
			struct = struct.add(entry.getKey(), DataTypes.StringType);
			break;
		case "int":
			struct = struct.add(entry.getKey(), DataTypes.IntegerType);
			break;
		case "double":
			struct = struct.add(entry.getKey(), DataTypes.DoubleType);
			break;
		}
	}
	return struct;
}
 
开发者ID:pfratta,项目名称:ParquetUtils,代码行数:19,代码来源:ParquetGeneratorEngine.java

示例6: fromSchema

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
/**
 * Convert a datavec schema to a
 * struct type in spark
 *
 * @param schema the schema to convert
 * @return the datavec struct type
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
开发者ID:deeplearning4j,项目名称:DataVec,代码行数:32,代码来源:DataFrames.java

示例7: generateData_numbers_1k

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
开发者ID:uber,项目名称:uberscriptquery,代码行数:22,代码来源:QueryEngine.java

示例8: writeEntityMetadata

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:42,代码来源:MetadataWriter.java

示例9: schemaFor

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
private static Schema schemaFor(StructType structType, String record, String namespace, int recordCount) {

    // Increment after using the recordCount
    record = (null == record) ? "record" + recordCount++ : record;

    LOG.debug("Converting {} to Avro Record schema [{}:{}]", structType, record, namespace);
    SchemaBuilder.RecordBuilder<Schema> schema = SchemaBuilder.record(record);

    if (null != namespace) {
      schema.namespace(namespace);
    }

    schema.doc("Auto-generated from Spark DataFrame");

    SchemaBuilder.FieldAssembler<Schema> assembler = schema.fields();
    StructField[] structFields = structType.fields();

    for (StructField f : structFields) {
      assembler.name(f.name()).type(typeFor(f.dataType(), f.nullable(), recordCount)).noDefault();
    }

    return assembler.endRecord();
  }
 
开发者ID:cloudera-labs,项目名称:envelope,代码行数:24,代码来源:AvroUtils.java

示例10: coerceParsedTokens

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
private static Object[] coerceParsedTokens(
        String line, Object[] tokens, boolean safe,
        StructType schema, List<FastDateFormat> dateFormats)  throws Exception {

    Object[] result = new Object[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
        try {
            tokens[i] = !schema.apply(i).dataType().simpleString().equalsIgnoreCase("String") ? tokens[i].toString().trim() : tokens[i];
            result[i] = TypeCast.inputValue(tokens[i].toString(), schema.apply(i).dataType(),
                    schema.apply(i).nullable(), "null", true, dateFormats.get(i));
        } catch (Exception exception) {
            result[i] = null;
            if (!safe) {
                throw new RuntimeException(getSafeMessage(tokens[i], i, schema) + "\n Line being parsed => " + line,exception);
            }
        }
    }
    tokens = result;
    return tokens;
}
 
开发者ID:capitalone,项目名称:Hydrograph,代码行数:21,代码来源:DelimitedAndFixedWidthHelper.java

示例11: fromSchemaSequence

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
/**
 * Convert the DataVec sequence schema to a StructType for Spark, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}}
 * <b>Note</b>: as per {@link #toDataFrameSequence(Schema, JavaRDD)}}, the StructType has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN} - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema to convert
 * @return StructType for the schema
 */
public static StructType fromSchemaSequence(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns() + 2];

    structFields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    structFields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());

    for (int i = 0; i < schema.numColumns(); i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
开发者ID:deeplearning4j,项目名称:DataVec,代码行数:45,代码来源:DataFrames.java

示例12: createRelation

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
@Override
public SparkRDF4JSparqlRelation createRelation(SQLContext sqlContext,
		scala.collection.immutable.Map<String, String> scalaParameters, StructType schema) {
	Map<String, String> parameters = JavaConversions.asJavaMap(scalaParameters);
	String service = Optional.ofNullable(parameters.get("service")).orElseThrow(() -> new RuntimeException(
			"Spark RDF4J Sparql requires a SPARQL 'service' to be specified in the parameters"));
	String query = Optional.ofNullable(parameters.get("query")).orElseThrow(() -> new RuntimeException(
			"Spark RDF4J Sparql requires a 'query' to be specified in the parameters"));

	try {
		ParsedQuery parsedQuery = QueryParserUtil.parseQuery(QueryLanguage.SPARQL, query, null);
		if(!(parsedQuery instanceof ParsedTupleQuery)) {
			throw new RuntimeException("Spark RDF4J can only be used with Tuple (Select) queries right now.");
		}
		return new SparkRDF4JSparqlRelation(service, parsedQuery, schema, sqlContext);
	} catch (MalformedQueryException e) {
		throw new RuntimeException("Query was not valid SPARQL", e);
	}

}
 
开发者ID:ansell,项目名称:spark-rdf4j,代码行数:21,代码来源:SparkRDF4JDefaultSource.java

示例13: call

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
@Override
public void call(JavaRDD<String> rdd) throws Exception {

	JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
		private static final long serialVersionUID = 5167089361335095997L;

		@Override
		public Row call(String msg) {
			Row row = RowFactory.create(msg);
			return row;
		}
	});
	// Create Schema
	StructType schema = DataTypes.createStructType(
			new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

	// Get Spark 2.0 session
	SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
	Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
	msgDataFrame.show();
}
 
开发者ID:jgperrin,项目名称:net.jgp.labs.spark,代码行数:22,代码来源:RowProcessor.java

示例14: testPruneByStepValueFalse

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
@Test
public void testPruneByStepValueFalse() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(false)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);
  
  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put("dependencies", Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2", step2Config);
  steps.add(step2);
  
  Set<Step> refactored = step2.refactor(steps);
  
  assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
 
开发者ID:cloudera-labs,项目名称:envelope,代码行数:25,代码来源:TestDecisionStep.java

示例15: create

import org.apache.spark.sql.types.StructType; //导入依赖的package包/类
public static BaseOrdering create(StructType schema) {
    final DataType[] dataTypes = new DataType[schema.size()];
    for (int i = 0; i < schema.size(); i++) {
        dataTypes[i] = schema.get(i).dataType;
    }
    return new BaseOrdering() {
        @Override
        public int compare(InternalRow a, InternalRow b) {
            assert a.numFields() == b.numFields() && a.numFields() == schema.size();
            int res;
            for (int i = 0; i < dataTypes.length; i++) {
                if ((res = dataTypes[i].comparator.compare(a.getUniformVal(i), b.getUniformVal(i))) != 0) {
                    return res;
                }
            }
            return 0;
        }
    };
}
 
开发者ID:shunfei,项目名称:indexr,代码行数:20,代码来源:BaseOrdering.java


注:本文中的org.apache.spark.sql.types.StructType类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。