

Java RowFactory Class Code Examples

This article collects typical usage examples of the Java class org.apache.spark.sql.RowFactory. If you are wondering what the RowFactory class does, how to use it, or what real-world RowFactory code looks like, the hand-picked examples below should help.


The RowFactory class belongs to the org.apache.spark.sql package. A total of 15 RowFactory code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
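Before the collected examples, here is a minimal, self-contained sketch of the pattern they all share: build Row objects with RowFactory.create, define a matching StructType schema, and turn the rows into a Dataset<Row> via createDataFrame. This sketch is illustrative only; the local SparkSession configuration, the class name RowFactoryDemo, and the column names are assumptions rather than code from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryDemo {
    public static void main(String[] args) {
        // Illustrative local session; any existing SparkSession works the same way.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("RowFactoryDemo")
                .getOrCreate();

        // Each argument to RowFactory.create becomes one column value,
        // in the same order as the schema fields defined below.
        List<Row> rows = Arrays.asList(
                RowFactory.create(1, "hello"),
                RowFactory.create(2, "world"));

        StructType schema = new StructType(new StructField[]{
                new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
                new StructField("text", DataTypes.StringType, false, Metadata.empty())
        });

        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.show();

        spark.stop();
    }
}

Each argument to RowFactory.create becomes one column value, so the argument order and types must line up with the schema fields. Passing a single java.util.List (as Example 1 does) yields a one-column row whose value is the whole list, matching an ArrayType column.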

Example 1: createNGramDataFrame

import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines the input text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 27, Source: NGramBuilder.java

Example 2: test_getDataSetResult

import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void test_getDataSetResult() {

    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));

    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
 
Developer: uber, Project: uberscriptquery, Lines: 25, Source: SparkUtilsTest.java

Example 3: generateData_week_timepoints_by_10_minutes

import org.apache.spark.sql.RowFactory; // import the required package/class
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
Developer: uber, Project: uberscriptquery, Lines: 24, Source: QueryEngine.java

Example 4: parse

import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc the Java Spark context
 * @param sentences the PoS-tagged input sentences, one per line
 * @param outputFileName the output file name
 * @param outputFormat the output format
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
		private static final long serialVersionUID = -812004521983071103L;
		public Row call(DependencyGraph graph) {
			return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
		}
	});
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),	
		new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame df = sqlContext.createDataFrame(rows, schema);
	
	if (outputFormat == OutputFormat.TEXT)  
		df.select("dependency").write().text(outputFileName);
	else 
		df.repartition(1).write().json(outputFileName);
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 31, Source: DependencyParser.java

Example 5: generateData_numbers_1k

import org.apache.spark.sql.RowFactory; // import the required package/class
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;

    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();

    // the bound is inclusive, so this produces 1001 rows (0 through 1000)
    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }

    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
 
Developer: uber, Project: uberscriptquery, Lines: 22, Source: QueryEngine.java

Example 6: writeEntityMetadata

import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Developer: Merck, Project: rdf2x, Lines: 42, Source: MetadataWriter.java

Example 7: call

import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(String line) throws Exception {
    String[] strArr;
    if (StringUtils.isEmpty(dataFile.getDelim())) {
        strArr = new String[]{line};
    } else {
        strArr = line.split(dataFile.getDelim());
    }

    List<FieldInfo> fieldInfos = dataFile.getFieldInfos();
    Object[] objs = new Object[fieldInfos.size()];
    for (int i = 0; i < fieldInfos.size(); i++) {
        FieldInfo fieldInfo = fieldInfos.get(i);
        // single column
        if (fieldInfo.getIndex() != -1) {
            objs[i] = fieldCall(fieldInfo, strArr[i]);
        } else {
            // multiple columns
            int tmpSize = fieldInfo.getEndIndex() - fieldInfo.getStartIndex() + 1;
            String[] tmp = new String[tmpSize];
            System.arraycopy(strArr, fieldInfo.getStartIndex(), tmp, 0, tmpSize);
            objs[i] = fieldCall(fieldInfo, tmp);
        }
    }
    return RowFactory.create(objs);
}
 
Developer: hays2hong, Project: stonk, Lines: 27, Source: LineParse.java

Example 8: getRelatedTypeIDs

import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Map a {@link Instance} into an Iterator of all of its relations
 * represented as rows of (related URI, predicate index, type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return an Iterator of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
 */
private Iterable<Row> getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();

    final List<Tuple2<Integer, Long>> instanceTypeIDs = getRelationEntityTypes(instance)
            .map(typeIndex -> new Tuple2<>(typeIndex, id))
            .collect(Collectors.toList());

    return instance.getRelations().stream()
            .flatMap(relation ->
                    instanceTypeIDs.stream()
                            .map(instanceTypeID -> RowFactory.create(
                                    relation.getObjectURI(),
                                    relation.getPredicateIndex(),
                                    instanceTypeID._1(),
                                    instanceTypeID._2()
                            ))
            ).collect(Collectors.toList());
}
 
Developer: Merck, Project: rdf2x, Lines: 27, Source: RelationExtractor.java

Example 9: call

import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(Row row) throws Exception {
	List<String> words = Arrays.asList(row.getString(0).split("\\s+"));
	int n = words.size();
	List<Tuple2<String, String>> sequence = new ArrayList<Tuple2<String, String>>(n);
	for (int i = 0; i < n; i++) {
		sequence.add(new Tuple2<String, String>(words.get(i), "UNK"));
	}
	List<String> partsOfSpeech = decode(sequence);
	StringBuilder sb = new StringBuilder();
	for (String pos : partsOfSpeech) {
		sb.append(pos);
		sb.append(' ');
	}
	return RowFactory.create(row.getString(0), sb.toString().trim());
}
 
Developer: phuonglh, Project: vn.vitk, Lines: 17, Source: CMMModel.java

Example 10: call

import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(Tuple2<Iterable<Row>, Iterable<Row>> cogrouped) throws Exception {
  // There should only be one 'into' record per key
  Row intoRow = cogrouped._1().iterator().next();
  Row[] fromRows = Iterables.toArray(cogrouped._2(), Row.class);
  int intoRowNumFields = intoRow.size();

  Object[] nestedValues = new Object[intoRowNumFields + 1];
  for (int i = 0; i < intoRowNumFields; i++) {
    nestedValues[i] = intoRow.get(i);
  }
  nestedValues[intoRowNumFields] = fromRows;

  Row nested = RowFactory.create(nestedValues);

  return nested;
}
 
Developer: cloudera-labs, Project: envelope, Lines: 18, Source: NestDeriver.java

Example 11: rowForRecord

import org.apache.spark.sql.RowFactory; // import the required package/class
private static Row rowForRecord(GenericRecord record) {
  List<Object> values = Lists.newArrayList();

  for (Field field : record.getSchema().getFields()) {
    Object value = record.get(field.name());

    Type fieldType = field.schema().getType();
    if (fieldType.equals(Type.UNION)) {
      fieldType = field.schema().getTypes().get(1).getType();
    }
    // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
    if (fieldType.equals(Type.STRING) && value != null) {
      value = value.toString();
    }

    values.add(value);
  }

  return RowFactory.create(values.toArray());
}
 
Developer: cloudera-labs, Project: envelope, Lines: 21, Source: AvroTranslator.java

Example 12: readTextWithoutTranslator

import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void readTextWithoutTranslator() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(FileSystemInput.FORMAT_CONFIG, FileSystemInput.TEXT_FORMAT);
  configMap.put(FileSystemInput.PATH_CONFIG, FileSystemInput.class.getResource(TEXT_DATA).getPath());
  config = ConfigFactory.parseMap(configMap);
  
  FileSystemInput formatInput = new FileSystemInput();
  formatInput.configure(config);
  
  Dataset<Row> results = formatInput.read();
  
  assertEquals(2, results.count());
  assertTrue(results.collectAsList().contains(RowFactory.create("a=1,b=hello,c=true")));
  assertTrue(results.collectAsList().contains(RowFactory.create("a=2,b=world,c=false")));
}
 
Developer: cloudera-labs, Project: envelope, Lines: 17, Source: TestFileSystemInput.java

Example 13: readTextWithTranslator

import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void readTextWithTranslator() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(FileSystemInput.FORMAT_CONFIG, FileSystemInput.TEXT_FORMAT);
  configMap.put(FileSystemInput.PATH_CONFIG, FileSystemInput.class.getResource(TEXT_DATA).getPath());
  configMap.put("translator.type", KVPTranslator.class.getName());
  configMap.put("translator.delimiter.kvp", ",");
  configMap.put("translator.delimiter.field", "=");
  configMap.put("translator.field.names", Lists.newArrayList("a", "b", "c"));
  configMap.put("translator.field.types", Lists.newArrayList("int", "string", "boolean"));
  config = ConfigFactory.parseMap(configMap);
  
  FileSystemInput formatInput = new FileSystemInput();
  formatInput.configure(config);
  
  Dataset<Row> results = formatInput.read();
  
  assertEquals(2, results.count());
  assertTrue(results.collectAsList().contains(RowFactory.create(1, "hello", true)));
  assertTrue(results.collectAsList().contains(RowFactory.create(2, "world", false)));
}
 
Developer: cloudera-labs, Project: envelope, Lines: 22, Source: TestFileSystemInput.java

Example 14: testPlanner

import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void testPlanner() {
  List<Row> rows = Lists.newArrayList(RowFactory.create("a", 1, false), RowFactory.create("b", 2, true));
  StructType schema = new StructType(new StructField[] {
      new StructField("field1", DataTypes.StringType, false, null),
      new StructField("field2", DataTypes.IntegerType, false, null),
      new StructField("field3", DataTypes.BooleanType, false, null)
  });
  
  Dataset<Row> data = Contexts.getSparkSession().createDataFrame(rows, schema);
  
  BulkPlanner p = new DeletePlanner();
  p.configure(ConfigFactory.empty());
  
  List<Tuple2<MutationType, Dataset<Row>>> planned = p.planMutationsForSet(data);
  
  assertEquals(1, planned.size());
  assertEquals(MutationType.DELETE, planned.get(0)._1());
  assertEquals(data, planned.get(0)._2());
}
 
Developer: cloudera-labs, Project: envelope, Lines: 21, Source: TestDeletePlanner.java

Example 15: testPruneByStepValueTrue

import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void testPruneByStepValueTrue() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(true)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);
  
  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put("dependencies", Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2", step2Config);
  steps.add(step2);
  
  Set<Step> refactored = step2.refactor(steps);
  
  assertEquals(refactored, Sets.newHashSet(step1, step2, step3, step4, step7, step8));
}
 
Developer: cloudera-labs, Project: envelope, Lines: 25, Source: TestDecisionStep.java


Note: The org.apache.spark.sql.RowFactory examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and the source code copyright remains with the original authors. Please consult each project's License before distributing or using the code; do not republish without permission.