This article collects typical usage examples of the Java class org.apache.spark.sql.RowFactory. If you are unsure what RowFactory is for, how to use it, or what real-world usage looks like, the curated code examples below may help.
The RowFactory class belongs to the org.apache.spark.sql package. A total of 15 RowFactory code examples are shown below, sorted by popularity by default.
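Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the usual RowFactory pattern: build Row objects with RowFactory.create, declare a matching StructType, and turn them into a Dataset&lt;Row&gt;. The class name RowFactoryQuickStart and the local-mode SparkSession are assumptions made only for this illustration.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryQuickStart {
    public static void main(String[] args) {
        // hypothetical local session, used only for this sketch
        SparkSession spark = SparkSession.builder()
            .appName("RowFactoryQuickStart")
            .master("local[*]")
            .getOrCreate();

        // RowFactory.create takes the column values in schema order
        List<Row> rows = Arrays.asList(
            RowFactory.create(1, "alice"),
            RowFactory.create(2, "bob"));

        StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("name", DataTypes.StringType, false, Metadata.empty())});

        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.show();
        spark.stop();
    }
}

The examples below apply the same RowFactory.create call in a variety of contexts: building DataFrames from RDDs, unit tests, Avro record conversion, and test data generation.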
Example 1: createNGramDataFrame
import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines a JavaRDD of text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            // each row holds the whitespace-separated tokens of one line
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
Example 2: test_getDataSetResult
import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void test_getDataSetResult() {
    StructField[] structFields = new StructField[]{
        new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
        new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())
    };
    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));
    Dataset<Row> df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));
}
Example 3: generateData_week_timepoints_by_10_minutes
import org.apache.spark.sql.RowFactory; // import the required package/class
private static Dataset<Row> generateData_week_timepoints_by_10_minutes(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "timepoint";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    int weekTotalMinutes = 7 * 24 * 60;
    int timepointIntervalMinutes = 10;
    for (int i = 0; i < weekTotalMinutes / timepointIntervalMinutes; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
Example 4: parse
import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    JavaRDD<String> input = jsc.parallelize(sentences);
    JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
    JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
    JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
        private static final long serialVersionUID = -812004521983071103L;

        public Row call(DependencyGraph graph) {
            return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
        }
    });
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame df = sqlContext.createDataFrame(rows, schema);
    if (outputFormat == OutputFormat.TEXT)
        df.select("dependency").write().text(outputFileName);
    else
        df.repartition(1).write().json(outputFileName);
}
Example 5: generateData_numbers_1k
import org.apache.spark.sql.RowFactory; // import the required package/class
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
    StructField[] structFields = new StructField[1];
    org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
    String column = "number";
    StructField structField = new StructField(column, dataType, true, Metadata.empty());
    structFields[0] = structField;
    StructType structType = new StructType(structFields);

    List<Row> rows = new ArrayList<>();
    for (int i = 0; i <= 1000; i++) {
        Object[] objects = new Object[structFields.length];
        objects[0] = i;
        Row row = RowFactory.create(objects);
        rows.add(row);
    }
    Dataset<Row> df = spark.createDataFrame(rows, structType);
    return df;
}
Example 6: writeEntityMetadata
import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Writes metadata describing entity tables.
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));
    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
        .map(table -> {
            Object[] valueArray = new Object[]{
                table.getName(),
                table.getTypeURI(),
                uriLabels.get(table.getTypeURI()),
                table.getNumRows()
            };
            return RowFactory.create(valueArray);
        }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 7: call
import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(String line) throws Exception {
    String[] strArr;
    if (StringUtils.isEmpty(dataFile.getDelim())) {
        strArr = new String[]{line};
    } else {
        strArr = line.split(dataFile.getDelim());
    }
    List<FieldInfo> fieldInfos = dataFile.getFieldInfos();
    Object[] objs = new Object[fieldInfos.size()];
    for (int i = 0; i < fieldInfos.size(); i++) {
        FieldInfo fieldInfo = fieldInfos.get(i);
        // single column
        if (fieldInfo.getIndex() != -1) {
            objs[i] = fieldCall(fieldInfo, strArr[i]);
        // multiple columns
        } else {
            int tmpSize = fieldInfo.getEndIndex() - fieldInfo.getStartIndex() + 1;
            String[] tmp = new String[tmpSize];
            System.arraycopy(strArr, fieldInfo.getStartIndex(), tmp, 0, tmpSize);
            objs[i] = fieldCall(fieldInfo, tmp);
        }
    }
    return RowFactory.create(objs);
}
Example 8: getRelatedTypeIDs
import org.apache.spark.sql.RowFactory; // import the required package/class
/**
 * Maps an {@link Instance} into an Iterable of all of its relations,
 * represented as rows of (related URI, predicate index, type index, instance ID).
 *
 * @param instance the requested {@link Instance}
 * @return an Iterable of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
 */
private Iterable<Row> getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();
    final List<Tuple2<Integer, Long>> instanceTypeIDs = getRelationEntityTypes(instance)
        .map(typeIndex -> new Tuple2<>(typeIndex, id))
        .collect(Collectors.toList());

    return instance.getRelations().stream()
        .flatMap(relation ->
            instanceTypeIDs.stream()
                .map(instanceTypeID -> RowFactory.create(
                    relation.getObjectURI(),
                    relation.getPredicateIndex(),
                    instanceTypeID._1(),
                    instanceTypeID._2()
                ))
        ).collect(Collectors.toList());
}
Example 9: call
import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(Row row) throws Exception {
    List<String> words = Arrays.asList(row.getString(0).split("\\s+"));
    int n = words.size();
    // pair each word with a placeholder tag before decoding
    List<Tuple2<String, String>> sequence = new ArrayList<Tuple2<String, String>>(n);
    for (int i = 0; i < n; i++) {
        sequence.add(new Tuple2<String, String>(words.get(i), "UNK"));
    }
    List<String> partsOfSpeech = decode(sequence);
    StringBuilder sb = new StringBuilder();
    for (String pos : partsOfSpeech) {
        sb.append(pos);
        sb.append(' ');
    }
    // return (original sentence, space-separated tag sequence)
    return RowFactory.create(row.getString(0), sb.toString().trim());
}
Example 10: call
import org.apache.spark.sql.RowFactory; // import the required package/class
@Override
public Row call(Tuple2<Iterable<Row>, Iterable<Row>> cogrouped) throws Exception {
    // There should only be one 'into' record per key
    Row intoRow = cogrouped._1().iterator().next();
    Row[] fromRows = Iterables.toArray(cogrouped._2(), Row.class);
    int intoRowNumFields = intoRow.size();

    Object[] nestedValues = new Object[intoRowNumFields + 1];
    for (int i = 0; i < intoRowNumFields; i++) {
        nestedValues[i] = intoRow.get(i);
    }
    nestedValues[intoRowNumFields] = fromRows;

    Row nested = RowFactory.create(nestedValues);
    return nested;
}
Example 11: rowForRecord
import org.apache.spark.sql.RowFactory; // import the required package/class
private static Row rowForRecord(GenericRecord record) {
    List<Object> values = Lists.newArrayList();

    for (Field field : record.getSchema().getFields()) {
        Object value = record.get(field.name());

        Type fieldType = field.schema().getType();
        if (fieldType.equals(Type.UNION)) {
            fieldType = field.schema().getTypes().get(1).getType();
        }
        // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
        if (fieldType.equals(Type.STRING) && value != null) {
            value = value.toString();
        }

        values.add(value);
    }

    return RowFactory.create(values.toArray());
}
Example 12: readTextWithoutTranslator
import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void readTextWithoutTranslator() throws Exception {
    Map<String, Object> configMap = Maps.newHashMap();
    configMap.put(FileSystemInput.FORMAT_CONFIG, FileSystemInput.TEXT_FORMAT);
    configMap.put(FileSystemInput.PATH_CONFIG, FileSystemInput.class.getResource(TEXT_DATA).getPath());
    config = ConfigFactory.parseMap(configMap);

    FileSystemInput formatInput = new FileSystemInput();
    formatInput.configure(config);

    Dataset<Row> results = formatInput.read();

    assertEquals(2, results.count());
    assertTrue(results.collectAsList().contains(RowFactory.create("a=1,b=hello,c=true")));
    assertTrue(results.collectAsList().contains(RowFactory.create("a=2,b=world,c=false")));
}
Example 13: readTextWithTranslator
import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void readTextWithTranslator() throws Exception {
    Map<String, Object> configMap = Maps.newHashMap();
    configMap.put(FileSystemInput.FORMAT_CONFIG, FileSystemInput.TEXT_FORMAT);
    configMap.put(FileSystemInput.PATH_CONFIG, FileSystemInput.class.getResource(TEXT_DATA).getPath());
    configMap.put("translator.type", KVPTranslator.class.getName());
    configMap.put("translator.delimiter.kvp", ",");
    configMap.put("translator.delimiter.field", "=");
    configMap.put("translator.field.names", Lists.newArrayList("a", "b", "c"));
    configMap.put("translator.field.types", Lists.newArrayList("int", "string", "boolean"));
    config = ConfigFactory.parseMap(configMap);

    FileSystemInput formatInput = new FileSystemInput();
    formatInput.configure(config);

    Dataset<Row> results = formatInput.read();

    assertEquals(2, results.count());
    assertTrue(results.collectAsList().contains(RowFactory.create(1, "hello", true)));
    assertTrue(results.collectAsList().contains(RowFactory.create(2, "world", false)));
}
Example 14: testPlanner
import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void testPlanner() {
    List<Row> rows = Lists.newArrayList(RowFactory.create("a", 1, false), RowFactory.create("b", 2, true));
    StructType schema = new StructType(new StructField[] {
        new StructField("field1", DataTypes.StringType, false, null),
        new StructField("field2", DataTypes.IntegerType, false, null),
        new StructField("field3", DataTypes.BooleanType, false, null)
    });
    Dataset<Row> data = Contexts.getSparkSession().createDataFrame(rows, schema);

    BulkPlanner p = new DeletePlanner();
    p.configure(ConfigFactory.empty());

    List<Tuple2<MutationType, Dataset<Row>>> planned = p.planMutationsForSet(data);

    assertEquals(1, planned.size());
    assertEquals(MutationType.DELETE, planned.get(0)._1());
    assertEquals(data, planned.get(0)._2());
}
Example 15: testPruneByStepValueTrue
import org.apache.spark.sql.RowFactory; // import the required package/class
@Test
public void testPruneByStepValueTrue() {
    StructType schema = new StructType(new StructField[] {
        new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
    });
    List<Row> rows = Lists.newArrayList(
        RowFactory.create(true)
    );
    Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
    step1.setData(ds);

    Map<String, Object> step2ConfigMap = Maps.newHashMap();
    step2ConfigMap.put("dependencies", Lists.newArrayList("step1"));
    step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
    step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
    step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
    Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
    RefactorStep step2 = new DecisionStep("step2", step2Config);
    steps.add(step2);

    Set<Step> refactored = step2.refactor(steps);

    assertEquals(refactored, Sets.newHashSet(step1, step2, step3, step4, step7, step8));
}
}