This article collects typical usage examples of the Java class org.apache.parquet.hadoop.example.GroupWriteSupport. If you are unsure what GroupWriteSupport is for or how to use it, the curated examples below should help.
GroupWriteSupport belongs to the org.apache.parquet.hadoop.example package. 15 code examples are shown below, ordered by popularity.
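All 15 examples share one pattern: GroupWriteSupport reads its schema from the Hadoop Configuration, so the schema must be registered with the static GroupWriteSupport.setSchema(...) before a writer is constructed. Here is a minimal, self-contained sketch of that pattern; the class name, file path, and field name are illustrative assumptions, not taken from the examples below.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupWriteSupportSketch {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; }");
    Configuration conf = new Configuration();
    // GroupWriteSupport picks the schema up from the Configuration
    GroupWriteSupport.setSchema(schema, conf);
    ParquetWriter<Group> writer = new ParquetWriter<Group>(
        new Path("/tmp/group-write-support-sketch.parquet"), conf, new GroupWriteSupport());
    try {
      writer.write(new SimpleGroupFactory(schema).newGroup().append("id", 1));
    } finally {
      writer.close();
    }
  }
}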
Example 1: generateEmptyWithSchema
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
static File generateEmptyWithSchema(File parentDir, String filename) throws IOException {
  File f = new File(parentDir, filename);
  Configuration conf = new Configuration();
  MessageType schema = parseMessageType(
      "message test { "
          + "required int32 int32_field; "
          + "required int64 int64_field; "
          + "required float float_field; "
          + "required double double_field; "
          + "required int64 timestamp_field (TIMESTAMP_MILLIS);"
          + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(),
      UNCOMPRESSED, 1024, 1024, 512, false, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
  writer.close(); // close immediately: the file contains only the schema, no rows
  return f;
}
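The multi-argument ParquetWriter constructor used in Example 1 is deprecated in recent parquet-mr releases. A hedged equivalent using the ExampleParquetWriter builder (assuming a parquet-mr version whose builder exposes withType, which registers the schema with the write support for you) might look like:
ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path(f.getPath()))
    .withType(schema)
    .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
    .withConf(conf)
    .build();
writer.close(); // still an empty file: schema only, no rows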
Example 2: generateSparseParquetFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
static File generateSparseParquetFile(File parentDir, String filename, int nrows) throws IOException {
  File f = new File(parentDir, filename);
  Configuration conf = new Configuration();
  MessageType schema = parseMessageType(
      "message test { optional int32 int32_field; optional binary string_field (UTF8); required int32 row; optional int32 int32_field2; } ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(),
      UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
  try {
    for (int i = 0; i < nrows; i++) {
      Group g = fact.newGroup();
      // the optional fields are populated only on every tenth row, producing a sparse file
      if (i % 10 == 0) { g = g.append("int32_field", i); }
      if (i % 10 == 0) { g = g.append("string_field", "CAT_" + (i % 10)); }
      if (i % 10 == 0) { g = g.append("int32_field2", i); }
      writer.write(g.append("row", i));
    }
  } finally {
    writer.close();
  }
  return f;
}
Example 3: generateParquetFileWithNullCharacters
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
static File generateParquetFileWithNullCharacters(File parentDir, String filename, int nrows) throws IOException {
  File f = new File(parentDir, filename);
  Configuration conf = new Configuration();
  MessageType schema = parseMessageType(
      "message test { optional binary cat_field (UTF8); } ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(),
      UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
  try {
    for (int i = 0; i < nrows; i++) {
      Group g = fact.newGroup();
      // row 66 gets a value containing an embedded NUL character
      String value = i == 66 ? "CAT_0_weird\0" : "CAT_" + (i % 10);
      writer.write(g.append("cat_field", value));
    }
  } finally {
    writer.close();
  }
  return f;
}
Example 4: runMapReduceJob
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    jobConf.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setNumReduceTasks(0);
    jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
    DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
    DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
    DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);
    jobConf.setMapperClass(DeprecatedMapper.class);
    mapRedJob = JobClient.runJob(jobConf);
  }
}
Example 5: prepareFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
@Before
Class
public static void prepareFile() throws IOException {
  cleanup();
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withWriterVersion(PARQUET_1_0)
      .withCompressionCodec(GZIP)
      .withRowGroupSize(1024 * 1024)
      .withPageSize(1024)
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2 * 1024)
      .withConf(conf)
      .build();
  writeData(f, writer);
}
Example 6: readFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
public static List<Group> readFile(File f, Filter filter) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(f.getAbsolutePath()))
          .withConf(conf)
          .withFilter(filter)
          .build();
  Group current;
  List<Group> users = new ArrayList<Group>();
  current = reader.read();
  while (current != null) {
    users.add(current);
    current = reader.read();
  }
  return users;
}
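A hypothetical call site for readFile, written under the assumption that the static schema contains an int32 column named row as in Example 2; FilterCompat.NOOP can be passed when no filtering is wanted:
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import org.apache.parquet.filter2.compat.FilterCompat;

List<Group> everything = readFile(f, FilterCompat.NOOP);
List<Group> onlyRow7 = readFile(f, FilterCompat.get(eq(intColumn("row"), 7)));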
Example 7: writeAndTest
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();
  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration);
  context.write(writer);
  writer.close();
  context.test();
  context.path.delete();
}
Example 8: run
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
public void run() {
  Configuration conf = new Configuration();
  int blockSize = 1 * 1024;
  int pageSize = 1 * 1024;
  int dictionaryPageSize = 512;
  boolean enableDictionary = false;
  boolean validating = false;
  Path basePath = new Path("file:///Users/Jelly/Developer/test");
  MessageType schema = MessageTypeParser.parseMessageType("message test {" +
      "required binary id; " +
      "required binary content; " +
      "required int64 int64_field; " +
      "}");
  GroupWriteSupport writeSupport = new GroupWriteSupport();
  GroupWriteSupport.setSchema(schema, conf); // setSchema is static; call it via the class, not an instance
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
  try {
    ParquetWriter<Group> parquetWriter = new ParquetWriter<Group>(
        basePath,
        writeSupport,
        CompressionCodecName.UNCOMPRESSED,
        blockSize, pageSize, dictionaryPageSize,
        enableDictionary,
        validating,
        ParquetProperties.WriterVersion.PARQUET_2_0,
        conf);
    for (int i = 0; i < 50000; i++) {
      parquetWriter.write(groupFactory.newGroup()
          .append("id", "10")
          .append("content", "test" + i)
          .append("int64_field", Long.valueOf(i)));
    }
    parquetWriter.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
Example 9: test
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
@Test
public void test() throws IOException
{
  Type name = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "name");
  Type age = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, "age");
  Type score = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.DOUBLE, "score");
  Type student = new MessageType("student", Arrays.asList(name, age, score));
  MessageType schema = new MessageType("student", student);
  int blockSize = 256 * 1024 * 1024;
  int pageSize = 6 * 1024;
  int dictionaryPageSize = 512;
  boolean enableDictionary = false;
  boolean validating = false;
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();
  SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(schema);
  Configuration conf = new Configuration();
  conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
  Path path = new Path("hdfs://127.0.0.1:9000/student.parquet");
  GroupWriteSupport.setSchema(schema, conf); // setSchema is static; call it via the class, not an instance
  ParquetWriter<Group> parquetWriter = new ParquetWriter<Group>(
      path,
      groupWriteSupport,
      CompressionCodecName.UNCOMPRESSED,
      blockSize,
      pageSize,
      dictionaryPageSize,
      enableDictionary,
      validating,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf);
  parquetWriter.close(); // close the writer so the Parquet footer is actually written
}
Example 10: initWriter
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
public static ParquetWriter<Group> initWriter(MessageType schema, String fileName) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);
  ParquetWriter<Group> writer =
      new ParquetWriter<Group>(initFile(fileName),
          ParquetFileWriter.Mode.OVERWRITE,
          new GroupWriteSupport(),
          CompressionCodecName.SNAPPY,
          1024,
          1024,
          512,
          true, // enable dictionary encoding
          false,
          ParquetProperties.WriterVersion.PARQUET_1_0, conf
      );
  return writer;
}
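A hypothetical call site for initWriter; the schema, field name, and file name here are assumptions for illustration, and initFile plus the static conf are defined elsewhere in the original class:
MessageType schema = MessageTypeParser.parseMessageType("message m { required int32 id; }");
ParquetWriter<Group> writer = initWriter(schema, "overwrite-demo.parquet");
try {
  writer.write(new SimpleGroupFactory(schema).newGroup().append("id", 1));
} finally {
  writer.close(); // OVERWRITE mode means re-running this replaces the existing file
}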
Example 11: generateParquetFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
static File generateParquetFile(File parentDir, String filename, int nrows, Date date) throws IOException {
  File f = new File(parentDir, filename);
  Configuration conf = new Configuration();
  MessageType schema = parseMessageType(
      "message test { "
          + "required int32 int32_field; "
          + "required int64 int64_field; "
          + "required float float_field; "
          + "required double double_field; "
          + "required int64 timestamp_field (TIMESTAMP_MILLIS);"
          + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(),
      UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf);
  try {
    for (int i = 0; i < nrows; i++) {
      writer.write(fact.newGroup()
          .append("int32_field", 32 + i)
          .append("int64_field", 64L + i)
          .append("float_field", 1.0f + i)
          .append("double_field", 2.0d + i)
          .append("timestamp_field", date.getTime() + (i * 117))
      );
    }
  } finally {
    writer.close();
  }
  return f;
}
Example 12: testWriteReadStatisticsAllNulls
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));
  File testFile = temp.newFile();
  testFile.delete();
  writeSchema = "message example {\n" +
      "required binary content (UTF8);\n" +
      "}";
  Path path = new Path(testFile.toURI());
  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());
  Group r1 = new SimpleGroup(schema);
  writer.write(r1);
  writer.close();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls is correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
Example 13: writeFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
private static void writeFile(File out, Configuration conf, boolean useSchema2) throws IOException {
  if (!useSchema2) {
    GroupWriteSupport.setSchema(schema, conf);
  } else {
    GroupWriteSupport.setSchema(schema2, conf);
  }
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put("schema_num", useSchema2 ? "2" : "1");
  ParquetWriter<Group> writer = ExampleParquetWriter
      .builder(new Path(out.getAbsolutePath()))
      .withConf(conf)
      .withExtraMetaData(extraMetaData)
      .build();
  for (int i = 0; i < 1000; i++) {
    Group g = f.newGroup()
        .append("binary_field", "test" + i)
        .append("int32_field", i)
        .append("int64_field", (long) i)
        .append("boolean_field", i % 2 == 0)
        .append("float_field", (float) i)
        .append("double_field", (double) i)
        .append("flba_field", "foo");
    if (!useSchema2) {
      g = g.append("int96_field", Binary.fromConstantByteArray(new byte[12]));
    }
    writer.write(g);
  }
  writer.close();
}
Example 14: setUp
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
@Before
public void setUp() throws Exception {
  parquetOutputFormat = new ParquetOutputFormat(new GroupWriteSupport());
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), conf);
  expectedPoolSize = Math.round((double)
      ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax() *
      MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
  long rowGroupSize = expectedPoolSize / 2;
  conf.setLong(ParquetOutputFormat.BLOCK_SIZE, rowGroupSize);
  // the memory manager is not initialized until a writer is created
  createWriter(0).close(null);
}
Example 15: writeToFile
import org.apache.parquet.hadoop.example.GroupWriteSupport; // import the required package/class
public static void writeToFile(File f, List<User> users) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getAbsolutePath()), conf, new GroupWriteSupport());
  for (User u : users) {
    writer.write(groupFromUser(u));
  }
  writer.close();
}
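Since ParquetWriter implements java.io.Closeable, the explicit close()/try-finally handling used throughout these examples can be replaced with try-with-resources on Java 7+. A minimal sketch based on Example 15:
GroupWriteSupport.setSchema(schema, conf);
try (ParquetWriter<Group> writer =
         new ParquetWriter<Group>(new Path(f.getAbsolutePath()), conf, new GroupWriteSupport())) {
  for (User u : users) {
    writer.write(groupFromUser(u)); // the writer is closed automatically, even if write throws
  }
}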