Is it possible to read and write Parquet using Java without a dependency on Hadoop and HDFS? Is it possible to read and write Parquet using Java without a dependency on Hadoop and HDFS? hadoop hadoop

Is it possible to read and write Parquet using Java without a dependency on Hadoop and HDFS?


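You can write Parquet files outside a Hadoop cluster using the Java Parquet client API. Note that this API still needs the Hadoop client libraries on the classpath (the sample below imports `org.apache.hadoop.fs.Path`), but it does not need a running Hadoop cluster or HDFS.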

Here is sample Java code that writes a Parquet file to the local disk.

import org.apache.avro.Schema;import org.apache.avro.generic.GenericData;import org.apache.avro.generic.GenericRecord;import org.apache.hadoop.fs.Path;import org.apache.parquet.avro.AvroSchemaConverter;import org.apache.parquet.avro.AvroWriteSupport;import org.apache.parquet.hadoop.ParquetWriter;import org.apache.parquet.hadoop.metadata.CompressionCodecName;import org.apache.parquet.schema.MessageType;public class Test {    void test() throws IOException {        final String schemaLocation = "/tmp/avro_format.json";        final Schema avroSchema = new Schema.Parser().parse(new File(schemaLocation));        final MessageType parquetSchema = new AvroSchemaConverter().convert(avroSchema);        final WriteSupport<Pojo> writeSupport = new AvroWriteSupport(parquetSchema, avroSchema);        final String parquetFile = "/tmp/parquet/data.parquet";        final Path path = new Path(parquetFile);        ParquetWriter<GenericRecord> parquetWriter = new ParquetWriter(path, writeSupport, CompressionCodecName.SNAPPY, BLOCK_SIZE, PAGE_SIZE);        final GenericRecord record = new GenericData.Record(avroSchema);        record.put("id", 1);        record.put("age", 10);        record.put("name", "ABC");        record.put("place", "BCD");        parquetWriter.write(record);        parquetWriter.close();    }}

The Avro schema file, avro_format.json:

{   "type":"record",   "name":"Pojo",   "namespace":"com.xx.test",   "fields":[      {         "name":"id",         "type":[            "int",            "null"         ]      },      {         "name":"age",         "type":[            "int",            "null"         ]      },      {         "name":"name",         "type":[            "string",            "null"         ]      },      {         "name":"place",         "type":[            "string",            "null"         ]      }   ]}

Hope this helps.