This article collects typical usage examples of the Java class parquet.hadoop.ParquetInputSplit. If you are wondering what ParquetInputSplit is for or how it is used, the curated code examples below may help.
The ParquetInputSplit class belongs to the parquet.hadoop package. Seven code examples of the class are shown below, ordered by popularity by default.
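Before the individual examples, here is a minimal sketch of the most common pattern: casting the split handed to a mapper back to ParquetInputSplit in order to inspect its metadata. SplitInfoMapper is a hypothetical class name used only for illustration; getPath(), getStart() and getLength() are inherited from FileSplit, which ParquetInputSplit extends, and getFileSchema() is assumed to behave as in Example 4 below.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import parquet.hadoop.ParquetInputSplit;

// Hypothetical mapper, generic over the value type, used only to show how the
// split metadata can be read inside a task.
public class SplitInfoMapper<V> extends Mapper<LongWritable, V, NullWritable, Text> {

  @Override
  protected void map(LongWritable key, V value, Context context)
      throws IOException, InterruptedException {
    // The split handed to a Parquet-backed job can be cast to ParquetInputSplit.
    ParquetInputSplit split = (ParquetInputSplit) context.getInputSplit();

    // Path and byte range come from FileSplit, which ParquetInputSplit extends;
    // getFileSchema() returns the schema of the whole file, as in Example 4 below.
    String info = split.getPath()
        + " [" + split.getStart() + ", " + (split.getStart() + split.getLength()) + ")"
        + " schema: " + split.getFileSchema();

    context.write(NullWritable.get(), new Text(info));
  }
}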
Example 1: getSplits

import parquet.hadoop.ParquetInputSplit; // import the required package/class

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }

  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
Example 2: createRecordReader

import parquet.hadoop.ParquetInputSplit; // import the required package/class

@Test
public void createRecordReader() throws Exception {
  String parquetFilePath = getClass().getClassLoader().getResource( "sample.pqt" ).toExternalForm();

  PentahoParquetInputFormat pentahoParquetInputFormat = new PentahoParquetInputFormat();
  pentahoParquetInputFormat.setInputFile( getClass().getClassLoader().getResource( "sample.pqt" ).toExternalForm() );
  SchemaDescription schema = pentahoParquetInputFormat.readSchema( parquetFilePath );
  pentahoParquetInputFormat.setSchema( schema );

  ParquetInputSplit parquetInputSplit = Mockito.spy( ParquetInputSplit.class );
  Whitebox.setInternalState( parquetInputSplit, "rowGroupOffsets", new long[] { 4 } );
  Whitebox.setInternalState( parquetInputSplit, "file", new Path( parquetFilePath ) );

  PentahoInputSplitImpl pentahoInputSplit = new PentahoInputSplitImpl( parquetInputSplit );

  IPentahoInputFormat.IPentahoRecordReader recordReader =
      pentahoParquetInputFormat.createRecordReader( pentahoInputSplit );

  Assert.assertNotNull( "recordReader should NOT be null!", recordReader );
  Assert.assertTrue( "recordReader should be instance of IPentahoInputFormat.IPentahoRecordReader",
      recordReader instanceof IPentahoInputFormat.IPentahoRecordReader );
}
Example 3: readCreatedParquetFile

import parquet.hadoop.ParquetInputSplit; // import the required package/class

private IPentahoInputFormat.IPentahoRecordReader readCreatedParquetFile( String parquetFilePath ) {
  IPentahoInputFormat.IPentahoRecordReader recordReader = null;
  try {
    PentahoParquetInputFormat pentahoParquetInputFormat = new PentahoParquetInputFormat();
    pentahoParquetInputFormat.setInputFile( parquetFilePath );
    SchemaDescription schema = pentahoParquetInputFormat.readSchema( parquetFilePath );
    pentahoParquetInputFormat.setSchema( schema );

    ParquetInputSplit parquetInputSplit = Mockito.spy( ParquetInputSplit.class );
    Whitebox.setInternalState( parquetInputSplit, "rowGroupOffsets", new long[] { 4 } );
    Whitebox.setInternalState( parquetInputSplit, "file", new org.apache.hadoop.fs.Path( parquetFilePath ) );

    PentahoInputSplitImpl pentahoInputSplit = new PentahoInputSplitImpl( parquetInputSplit );

    recordReader = pentahoParquetInputFormat.createRecordReader( pentahoInputSplit );
  } catch ( Exception e ) {
    e.printStackTrace();
  }
  return recordReader;
}
Example 4: map

import parquet.hadoop.ParquetInputSplit; // import the required package/class

@Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
  NullWritable outKey = NullWritable.get();
  if (expectedFields == null) {
    // Get the file schema (which may be different from the fields in a particular record) from the input split
    String fileSchema = ((ParquetInputSplit) context.getInputSplit()).getFileSchema();
    // System.err.println("file schema from context: " + fileSchema);
    RecordSchema schema = new RecordSchema(fileSchema);
    expectedFields = schema.getFields();
    // System.err.println("inferred schema: " + expectedFields.toString());
  }

  // No public accessor to the column values in a Group, so extract them from the string representation
  String line = value.toString();
  String[] fields = line.split("\n");

  StringBuilder csv = new StringBuilder();
  boolean hasContent = false;
  int i = 0;
  // Look for each expected column
  Iterator<FieldDescription> it = expectedFields.iterator();
  while (it.hasNext()) {
    if (hasContent) {
      csv.append(',');
    }
    String name = it.next().name;
    if (fields.length > i) {
      String[] parts = fields[i].split(": ");
      // We assume proper order, but there may be fields missing
      if (parts[0].equals(name)) {
        boolean mustQuote = (parts[1].contains(",") || parts[1].contains("'"));
        if (mustQuote) {
          csv.append('"');
        }
        csv.append(parts[1]);
        if (mustQuote) {
          csv.append('"');
        }
        hasContent = true;
        i++;
      }
    }
  }
  context.write(outKey, new Text(csv.toString()));
}
Example 5: ParquetInputSplitWrapper

import parquet.hadoop.ParquetInputSplit; // import the required package/class

public ParquetInputSplitWrapper(ParquetInputSplit realSplit) {
  this.realSplit = realSplit;
}
Example 6: readFields

import parquet.hadoop.ParquetInputSplit; // import the required package/class

@Override
public void readFields(DataInput in) throws IOException {
  realSplit = new ParquetInputSplit();
  realSplit.readFields(in);
}
Example 7: test11

import parquet.hadoop.ParquetInputSplit; // import the required package/class

@Test
public void test11() throws IOException, InterruptedException {
  // verify locations in order
  ArrayList<InputSplit> rawSplits = new ArrayList<InputSplit>();

  // first split is a ParquetInputSplit
  rawSplits.add(new ParquetInputSplit(new Path("path1"), 0, 100,
      new String[] { "l1", "l2", "l3" },
      new ArrayList<BlockMetaData>(), "", "",
      new HashMap<String, String>(), new HashMap<String, String>()));
  // second split is a FileSplit
  rawSplits.add(new FileSplit(new Path("path2"), 0, 400, new String[] {
      "l5", "l6", "l1" }));

  List<InputSplit> result = pigInputFormat.getPigSplits(rawSplits, 0, ok,
      null, true, conf);

  // Pig combines the two into one PigSplit
  Assert.assertEquals(result.size(), 1);

  for (InputSplit split : result) {
    PigSplit pigSplit = (PigSplit) split;

    // write to a byte array output stream
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    DataOutput out = new DataOutputStream(outputStream);
    pigSplit.write(out);

    // restore the pig split from the byte array
    ByteArrayInputStream inputStream = new ByteArrayInputStream(
        outputStream.toByteArray());
    DataInput in = new DataInputStream(inputStream);
    PigSplit anotherSplit = new PigSplit();
    anotherSplit.setConf(conf);
    anotherSplit.readFields(in);

    Assert.assertEquals(500, anotherSplit.getLength());
    Assert.assertEquals(2, anotherSplit.getNumPaths());
    Assert.assertEquals("parquet.hadoop.ParquetInputSplit",
        (anotherSplit.getWrappedSplit(0).getClass().getName()));
    Assert.assertEquals(
        "org.apache.hadoop.mapreduce.lib.input.FileSplit",
        (anotherSplit.getWrappedSplit(1).getClass().getName()));
  }
}