diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java
index 3bd0f006a6..3ca83f987b 100644
--- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java
+++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java
@@ -55,6 +55,10 @@ public boolean equals(Object obj) {
}
}
+ public Schema getNestedSchema() {
+ return nestedRecordSchema;
+ }
+
public int hashCode() {
return Objects.hashCode(dataType.hashCode(), nestedRecordSchema);
}
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
index 1a8a188b79..d8484613fb 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
@@ -33,8 +33,6 @@
UTF-8UTF-8
- 1.5.0
- 2.1.0
@@ -136,19 +134,35 @@
org.apache.hadoop
- hadoop-mapreduce-client-core
+ hadoop-common${hadoop.version}provided
+
+
+ zookeeper
+ org.apache.zookeeper
+
+ org.apache.hadoop
- hadoop-common
+ hadoop-mapreduce-client-core${hadoop.version}provided
+
+
+ hadoop-yarn-common
+ org.apache.hadoop
+
+
+ netty
+ io.netty
+
+ org.apache.hive
- hive-exec
+ hive-metastore${hive.version}provided
@@ -158,129 +172,201 @@
org.apache.hive
- hive-contrib
+ hive-serdeorg.apache.hive
- hive-hbase-handler
+ hive-shimss
- org.apache.hive
- hive-metastore
+ org.apache.thrift
+ libfb303
- org.apache.hive
- hive-serde
+ org.apache.thrift
+ libthrift
- org.apache.hive
- hive-shims
+ com.jolbox
+ bonecp
- org.apache.hive
- hive-testutils
+ tephra-hbase-compat-1.0
+ co.cask.tephra
- org.apache.thrift
- libfb303
+ tephra-core
+ co.cask.tephra
- org.apache.thrift
- libthrift
+ tephra-api
+ co.cask.tephra
- com.jolbox
- bonecp
+ hbase-client
+ org.apache.hbase
- com.google.protobuf
- protobuf-java
+ hadoop-yarn-server-resourcemanager
+ org.apache.hadoop
- org.apache.calcite
- calcite-core
+ antlr-runtime
+ org.antlr
- org.apache.calcite
- calcite-avatica
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ zookeeper
+ org.apache.zookeeperorg.apache.hive
- hive-metastore
+ hive-common${hive.version}provided
- org.apache.hive
- hive-common
+ jetty-all
+ org.eclipse.jetty.aggregate
- org.apache.hive
- hive-serde
+ javax.servlet
+ org.eclipse.jetty.orbit
- org.apache.hive
- hive-shimss
+ joda-time
+ joda-time
- org.apache.thrift
- libfb303
+ jackson-databind
+ com.fasterxml.jackson.core
- org.apache.thrift
- libthrift
+ metrics-json
+ io.dropwizard.metrics
- com.jolbox
- bonecp
+ metrics-jvm
+ io.dropwizard.metrics
+
+
+ metrics-core
+ io.dropwizard.metrics
+
+
+ ant
+ org.apache.ant
+
+
+ json
+ org.json
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ log4j-web
+ org.apache.logging.log4j
+
+
+ log4j-1.2-api
+ org.apache.logging.log4jorg.apache.hive
- hive-cli
+ hive-exec${hive.version}provided
+ hive-antorg.apache.hive
- hive-common
+ hive-llap-tezorg.apache.hive
- hive-exec
- org.apache.hive
- hive-metastore
+ ST4
+ org.antlr
- org.apache.hive
- hive-serde
+ ivy
+ org.apache.ivy
- org.apache.hive
- hive-service
+ curator-framework
+ org.apache.curator
- org.apache.hive
- hive-shims
+ apache-curator
+ org.apache.curator
- com.jolbox
- bonecp
+ groovy-all
+ org.codehaus.groovy
+
+
+ calcite-core
+ org.apache.calcite
+
+
+ calcite-avatica
+ org.apache.calcite
+
+
+ stax-api
+ stax
- jlinejline
+ jline
+
+
+ log4j-1.2-api
+ org.apache.logging.log4j
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ ant
+ org.apache.ant
+
+
+ zookeeper
+ org.apache.zookeeper
+
+
+ antlr-runtime
+ org.antlr
+
+
+
+
+ org.apache.hive
+ hive-serde
+ ${hive.version}
+ provided
+
+
+ opencsv
+ net.sf.opencsv
- com.twitter
- parquet-hive-bundle
+ org.apache.parquet
+ parquet-hadoop-bundle${parquet.version}
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
index 63f18b6f75..95cbf18fa8 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
@@ -38,12 +38,15 @@
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.orc.OrcConf;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.tajo.BuiltinStorages;
import org.apache.tajo.TajoConstants;
import org.apache.tajo.algebra.Expr;
import org.apache.tajo.algebra.IsNullPredicate;
import org.apache.tajo.algebra.JsonHelper;
import org.apache.tajo.catalog.*;
+import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.catalog.partition.PartitionMethodDesc;
import org.apache.tajo.catalog.proto.CatalogProtos;
import org.apache.tajo.catalog.proto.CatalogProtos.*;
@@ -56,10 +59,8 @@
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.util.KeyValueSet;
import org.apache.thrift.TException;
-import parquet.hadoop.ParquetOutputFormat;
import java.io.File;
-import java.io.IOException;
import java.util.*;
public class HiveCatalogStore extends CatalogConstants implements CatalogStore {
@@ -564,6 +565,16 @@ public final void createTable(final CatalogProtos.TableDescProto tableDescProto)
table.putToParameters(ParquetOutputFormat.COMPRESSION,
tableDesc.getMeta().getProperty(ParquetOutputFormat.COMPRESSION));
}
+ } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.ORC)) {
+ StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.ORC);
+ sd.setInputFormat(descriptor.getInputFormat());
+ sd.setOutputFormat(descriptor.getOutputFormat());
+ sd.getSerdeInfo().setSerializationLib(descriptor.getSerde());
+
+ if (tableDesc.getMeta().containsProperty(OrcConf.COMPRESS.getAttribute())) {
+ table.putToParameters(OrcConf.COMPRESS.getAttribute(),
+ tableDesc.getMeta().getProperty(OrcConf.COMPRESS.getAttribute()));
+ }
} else {
throw new UnsupportedException(tableDesc.getMeta().getDataFormat() + " in HivecatalogStore");
}
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
index bbb7adeee3..87b391ea60 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde.serdeConstants;
@@ -137,6 +138,8 @@ public static String getDataFormat(StorageDescriptor descriptor) {
return BuiltinStorages.PARQUET;
} else if (AvroSerDe.class.getName().equals(serde)) {
return BuiltinStorages.AVRO;
+ } else if (OrcSerde.class.getName().equals(serde)) {
+ return BuiltinStorages.ORC;
} else {
throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
}
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
index 7e1a3a4ff6..46935fc259 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
@@ -78,6 +78,7 @@ public static void setUp() throws Exception {
conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString());
conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri);
conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, warehousePath.toUri().toString());
+ conf.setBoolean("datanucleus.schema.autoCreateAll", true);
// create local HiveCatalogStore.
TajoConf tajoConf = new TajoConf(conf);
diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
index 4df418f5be..c9fa2b488c 100644
--- a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
+++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
@@ -208,7 +208,7 @@ private static void dumpDatabase(TajoClient client, String databaseName, PrintWr
}
}
writer.write("\n\n");
- } catch (Exception e) {
+ } catch (Throwable e) {
// dump for each table can throw any exception. We need to skip the exception case.
// here, the error message prints out via stderr.
System.err.println("ERROR:" + tableName + "," + e.getMessage());
diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
index 5b4c152a51..f69e7da2f4 100644
--- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
+++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
@@ -125,7 +125,7 @@ public String toString() {
/**
*
- * @param tm TimeMEta
+ * @param tm TimeMeta
* @param timeZone Timezone
* @param includeTimeZone Add timezone if it is true. It is usually used for TIMEZONEZ
* @return A timestamp string
diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
index 097963cb25..4612323deb 100644
--- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
+++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
@@ -89,11 +89,7 @@ public class StorageConstants {
public static final String DEFAULT_ORC_STRIPE_SIZE = "67108864"; // 64MB
public static final String ORC_COMPRESSION = "orc.compress";
- public static final String ORC_COMPRESSION_KIND_NONE = "none";
- public static final String ORC_COMPRESSION_KIND_SNAPPY = "snappy";
- public static final String ORC_COMPRESSION_KIND_LZO = "lzo";
- public static final String ORC_COMPRESSION_KIND_ZIP = "zlip";
- public static final String DEFAULT_ORC_COMPRESSION_KIND = ORC_COMPRESSION_KIND_NONE;
+ public static final String DEFAULT_ORC_COMPRESSION_KIND = "none";
public static final String ORC_BUFFER_SIZE = "orc.buffer.size";
public static final String DEFAULT_ORC_BUFFER_SIZE = "262144"; // 256KB
diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
index e55acf1fc0..a2dec50e91 100644
--- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
+++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
@@ -682,25 +682,6 @@ public void testLoadIntoTimezonedTable() throws Exception {
executeString("DROP TABLE IF EXISTS timezoned_load2 PURGE");
}
}
-
- @Test
- public void testTimezonedORCTable() throws Exception {
- try {
-
- executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned");
- executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc");
-
- executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned");
-
- ResultSet res = executeQuery();
- assertResultSet(res, "testTimezonedORCTable.result");
- executeString("SET TIME ZONE 'GMT'");
- cleanupQuery(res);
- } finally {
- executeString("DROP TABLE IF EXISTS timezoned");
- executeString("DROP TABLE IF EXISTS timezoned_orc PURGE");
- }
- }
@Test
public void testMultiBytesDelimiter1() throws Exception {
diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java
new file mode 100644
index 0000000000..29d132e35f
--- /dev/null
+++ b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage;
+
+import org.apache.tajo.IntegrationTest;
+import org.apache.tajo.QueryTestCaseBase;
+import org.junit.*;
+import org.junit.experimental.categories.Category;
+
+import java.sql.ResultSet;
+
+@Category(IntegrationTest.class)
+public class TestQueryOnOrcFile extends QueryTestCaseBase {
+
+ @Before
+ public void setup() throws Exception {
+ executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned");
+ executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc");
+
+ executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned");
+ }
+
+ @After
+ public void teardown() throws Exception {
+ executeString("DROP TABLE IF EXISTS timezoned");
+ executeString("DROP TABLE IF EXISTS timezoned_orc PURGE");
+ }
+
+ @Test
+ public void testTimezone1() throws Exception {
+ executeString("SET TIME ZONE 'GMT+9'");
+ ResultSet res = executeQuery();
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone2() throws Exception {
+ executeString("SET TIME ZONE 'GMT+1'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone3() throws Exception {
+ executeString("SET TIME ZONE 'GMT'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone4() throws Exception {
+ executeString("\\set TIMEZONE 'GMT-5'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+}
diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl b/tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl
similarity index 100%
rename from tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl
rename to tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl
diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl
new file mode 100644
index 0000000000..74b2e1b273
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl
@@ -0,0 +1,3 @@
+1980-4-1 01:50:30.010|01:50:30.010|1980-04-01
+80/4/1 1:50:30 AM|1:50:30 AM|80/4/1
+1980 April 1 1:50:30|1:50:30|1980-04-01
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql
new file mode 100644
index 0000000000..9c5d30d22c
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql
@@ -0,0 +1,5 @@
+CREATE EXTERNAL TABLE ${0} (
+ t_timestamp TIMESTAMP,
+ t_time TIME,
+ t_date DATE
+) USING TEXT WITH ('timezone' = 'GMT+9') LOCATION ${table.path}
diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql
similarity index 100%
rename from tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql
rename to tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql
diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql
new file mode 100644
index 0000000000..2464c974ae
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql
@@ -0,0 +1 @@
+SELECT * FROM timezoned_orc;
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql
deleted file mode 100644
index 1d898bd73c..0000000000
--- a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-SET SESSION TIMEZONE = 'GMT+9';
-SELECT * FROM timezoned_orc;
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result
similarity index 100%
rename from tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result
rename to tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result
new file mode 100644
index 0000000000..c0e5ceffe1
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 17:50:30.01,1980-04-01
+1980-03-31 17:50:30,1980-04-01
+1980-03-31 17:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result
new file mode 100644
index 0000000000..916f4be8dd
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 16:50:30.01,1980-04-01
+1980-03-31 16:50:30,1980-04-01
+1980-03-31 16:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result
new file mode 100644
index 0000000000..98e0918610
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 11:50:30.01,1980-04-01
+1980-03-31 11:50:30,1980-04-01
+1980-03-31 11:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml
index 095f128809..652ab84204 100644
--- a/tajo-dist/pom.xml
+++ b/tajo-dist/pom.xml
@@ -154,22 +154,14 @@
run cp -r ${project.basedir}/src/main/conf .
run rm -rf lib/tajo-*-${project.version}.jar
- run mkdir hive
- run mv lib/hive-*.jar hive/
-
+ run mkdir -p lib
+ run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/
+
run mkdir -p share/jdbc-dist
run cp -r $ROOT/tajo-jdbc/target/tajo-jdbc-${project.version}-jar-with-dependencies.jar ./share/jdbc-dist/tajo-jdbc-${project.version}.jar
run mkdir -p extlib
- if [ -f $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar ]
- then
- run cp -r $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar lib/
- echo
- echo "Tajo installed parquet-hive-bundle library at: ${project.build.directory}/tajo-${project.version}"
- echo
- fi
-
echo
echo "Tajo dist layout available at: ${project.build.directory}/tajo-${project.version}"
echo
diff --git a/tajo-dist/src/main/bin/tajo b/tajo-dist/src/main/bin/tajo
index c08c538201..007e960ffb 100755
--- a/tajo-dist/src/main/bin/tajo
+++ b/tajo-dist/src/main/bin/tajo
@@ -300,11 +300,15 @@ if [ ! -z ${HIVE_HOME} ] && [ -d ${HIVE_HOME} ] && [ -d ${HIVE_LIB} ]; then
CLASSPATH=${CLASSPATH}:$f;
done
- for f in ${HIVE_LIB}/datanucleus-*.jar; do
+ for f in ${HIVE_LIB}/javax.jdo-*.jar; do
CLASSPATH=${CLASSPATH}:$f;
done
-else
- for f in $TAJO_HOME/hive/*.jar; do
+
+ for f in ${HIVE_LIB}/log4j-core-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in ${HIVE_LIB}/datanucleus-*.jar; do
CLASSPATH=${CLASSPATH}:$f;
done
fi
diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml
index cd86d3b350..27fa66be32 100644
--- a/tajo-project/pom.xml
+++ b/tajo-project/pom.xml
@@ -36,10 +36,11 @@
2.7.22.5.01.1.1
- 1.1.0
+ 2.0.04.0.34.Final2.66.1.26
+ 1.8.1${project.parent.relativePath}/..src/main/hadoop-${hadoop.version}
diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
index 7f4661b451..2454714452 100644
--- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
+++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
@@ -130,7 +130,7 @@
tajo.storage.scanner-handler.orc.class
- org.apache.tajo.storage.orc.ORCScanner
+ org.apache.tajo.storage.orc.OrcScanner
diff --git a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
index 934dd01f24..1c4530a3cd 100644
--- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
+++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
@@ -132,7 +132,7 @@
tajo.storage.scanner-handler.orc.class
- org.apache.tajo.storage.orc.ORCScanner
+ org.apache.tajo.storage.orc.OrcScanner
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index 5f66395e94..aa6e6a66c2 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -34,7 +34,6 @@
UTF-8UTF-8
- 1.8.1
@@ -129,7 +128,6 @@
--proto_path=../../tajo-catalog/tajo-catalog-common/src/main/proto--java_out=target/generated-sources/protosrc/main/proto/StorageFragmentProtos.proto
- src/main/proto/orc_proto.proto
@@ -161,6 +159,26 @@
org.apache.maven.pluginsmaven-surefire-report-plugin
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy-dependencies
+ package
+
+ copy-dependencies
+
+
+ runtime
+ ${project.build.directory}/lib
+ false
+ false
+ true
+
+
+
+
@@ -345,10 +363,16 @@
netty-buffer
- com.facebook.presto
- presto-orc
- 0.141
+ org.apache.hive
+ hive-orc
+ ${hive.version}
+
+ org.apache.hive
+ hive-storage-api
+ ${hive.version}
+
+
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
index 7999d02487..b27c6401cf 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
@@ -20,6 +20,9 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
+import org.apache.orc.TypeDescription;
import org.apache.tajo.TajoConstants;
import org.apache.tajo.TaskAttemptId;
import org.apache.tajo.catalog.Schema;
@@ -29,12 +32,13 @@
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.storage.TableStatistics;
import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory;
-import org.apache.tajo.storage.thirdparty.orc.CompressionKind;
import org.apache.tajo.storage.thirdparty.orc.OrcFile;
+import org.apache.tajo.storage.thirdparty.orc.OrcFile.EncodingStrategy;
+import org.apache.tajo.storage.thirdparty.orc.OrcUtils;
import org.apache.tajo.storage.thirdparty.orc.Writer;
import java.io.IOException;
+import java.util.Properties;
import java.util.TimeZone;
public class ORCAppender extends FileAppender {
@@ -46,21 +50,14 @@ public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem
TableMeta meta, Path workDir) {
super(conf, taskAttemptId, schema, meta, workDir);
- timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE,
- TajoConstants.DEFAULT_SYSTEM_TIMEZONE));
+ timezone = meta.containsProperty(StorageConstants.TIMEZONE) ?
+ TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE)) :
+ TimeZone.getDefault();
}
@Override
public void init() throws IOException {
- writer = OrcFile.createWriter(workDir.getFileSystem(conf), path, conf,
- ObjectInspectorFactory.buildStructObjectInspector(schema),
- Long.parseLong(meta.getProperty(StorageConstants.ORC_STRIPE_SIZE,
- StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), getCompressionKind(),
- Integer.parseInt(meta.getProperty(StorageConstants.ORC_BUFFER_SIZE,
- StorageConstants.DEFAULT_ORC_BUFFER_SIZE)),
- Integer.parseInt(meta.getProperty(StorageConstants.ORC_ROW_INDEX_STRIDE,
- StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE)),
- timezone);
+ writer = OrcFile.createWriter(path, buildWriterOptions(conf, meta, schema), timezone);
if (tableStatsEnabled) {
this.stats = new TableStatistics(schema, columnStatsEnabled);
@@ -90,7 +87,6 @@ public void flush() throws IOException {
public void close() throws IOException {
writer.close();
- // TODO: getOffset is not implemented yet
// if (tableStatsEnabled) {
// stats.setNumBytes(getOffset());
// }
@@ -107,24 +103,81 @@ public TableStats getStats() {
@Override
public long getEstimatedOutputSize() throws IOException {
- return writer.getRawDataSize() * writer.getNumberOfRows();
+ return writer.getRawDataSize();
}
- private CompressionKind getCompressionKind() {
- String kindstr = meta.getProperty(StorageConstants.ORC_COMPRESSION, StorageConstants.DEFAULT_ORC_COMPRESSION_KIND);
+ private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) {
+ return OrcFile.writerOptions(conf)
+ .setSchema(OrcUtils.convertSchema(schema))
+ .compress(getCompressionKind(meta))
+ .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(),
+ String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
+ .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(),
+ String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
+ .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(),
+ String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
+ .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(),
+ String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
+ .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(),
+ String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
+ .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(),
+ String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
+ .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(),
+ String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
+ .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
+ String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
+ }
+
+ private static CompressionKind getCompressionKind(TableMeta meta) {
+ String kindstr = meta.getProperty(OrcConf.COMPRESS.getAttribute(),
+ String.valueOf(OrcConf.COMPRESS.getDefaultValue()));
- if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_ZIP)) {
+ if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) {
return CompressionKind.ZLIB;
}
- if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_SNAPPY)) {
+ if (kindstr.equalsIgnoreCase(CompressionKind.SNAPPY.name())) {
return CompressionKind.SNAPPY;
}
- if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_LZO)) {
+ if (kindstr.equalsIgnoreCase(CompressionKind.LZO.name())) {
return CompressionKind.LZO;
}
return CompressionKind.NONE;
}
+
+ /**
+ * Options for creating ORC file writers.
+ */
+ public static class WriterOptions extends OrcFile.WriterOptions {
+ // Setting the default batch size to 1000 makes the memory check at 5000
+ // rows work the same as the row by row writer. (If it was the default 1024,
+ // the smallest stripe size would be 5120 rows, which changes the output
+ // of some of the tests.)
+ private int batchSize = 1000;
+
+ public WriterOptions(Properties tableProperties, Configuration conf) {
+ super(tableProperties, conf);
+ }
+
+ /**
+ * Set the schema for the file. This is a required parameter.
+ * @param schema the schema for the file.
+ * @return this
+ */
+ public WriterOptions setSchema(TypeDescription schema) {
+ super.setSchema(schema);
+ return this;
+ }
+
+ protected WriterOptions batchSize(int maxSize) {
+ batchSize = maxSize;
+ return this;
+ }
+
+ int getBatchSize() {
+ return batchSize;
+ }
+ }
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java
deleted file mode 100644
index 0a4ebc6948..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java
+++ /dev/null
@@ -1,332 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc;
-
-import com.facebook.presto.orc.OrcDataSource;
-import com.facebook.presto.orc.OrcPredicate;
-import com.facebook.presto.orc.OrcReader;
-import com.facebook.presto.orc.OrcRecordReader;
-import com.facebook.presto.orc.memory.AggregatedMemoryContext;
-import com.facebook.presto.orc.metadata.OrcMetadataReader;
-import com.facebook.presto.spi.block.Block;
-import com.facebook.presto.spi.type.*;
-import com.google.protobuf.InvalidProtocolBufferException;
-import io.airlift.units.DataSize;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.tajo.TajoConstants;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.TableMeta;
-import org.apache.tajo.common.TajoDataTypes;
-import org.apache.tajo.conf.TajoConf;
-import org.apache.tajo.datum.*;
-import org.apache.tajo.exception.NotImplementedException;
-import org.apache.tajo.exception.TajoRuntimeException;
-import org.apache.tajo.plan.expr.EvalNode;
-import org.apache.tajo.storage.FileScanner;
-import org.apache.tajo.storage.StorageConstants;
-import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.storage.VTuple;
-import org.apache.tajo.storage.fragment.Fragment;
-import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource;
-import org.apache.tajo.util.datetime.DateTimeUtil;
-import org.joda.time.DateTimeZone;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TimeZone;
-
-/**
- * OrcScanner for reading ORC files
- */
-public class ORCScanner extends FileScanner {
- private static final Log LOG = LogFactory.getLog(ORCScanner.class);
- private OrcRecordReader recordReader;
- private Block[] blocks;
- private int currentPosInBatch = 0;
- private int batchSize = 0;
- private Tuple outTuple;
- private AggregatedMemoryContext aggrMemoryContext = new AggregatedMemoryContext();
-
- public ORCScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) {
- super(conf, schema, meta, fragment);
- }
-
- private FileSystem fs;
- private FSDataInputStream fis;
-
- private static class ColumnInfo {
- TajoDataTypes.DataType type;
- int id;
- }
-
- /**
- * Temporary array for caching column info
- */
- private ColumnInfo [] targetColInfo;
-
- @Override
- public void init() throws IOException {
- OrcReader orcReader;
- DataSize maxMergeDistance = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_MERGE_DISTANCE,
- StorageConstants.DEFAULT_ORC_MAX_MERGE_DISTANCE)), DataSize.Unit.BYTE);
- DataSize maxReadSize = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_READ_BUFFER_SIZE,
- StorageConstants.DEFAULT_ORC_MAX_READ_BUFFER_SIZE)), DataSize.Unit.BYTE);
-
- if (targets == null) {
- targets = schema.toArray();
- }
-
- outTuple = new VTuple(targets.length);
-
- Path path = fragment.getPath();
-
- if(fs == null) {
- fs = FileScanner.getFileSystem((TajoConf)conf, path);
- }
-
- if(fis == null) {
- fis = fs.open(path);
- }
-
- OrcDataSource orcDataSource = new HdfsOrcDataSource(
- this.fragment.getPath().toString(),
- fis,
- fs.getFileStatus(path).getLen(),
- maxMergeDistance,
- maxReadSize);
-
- targetColInfo = new ColumnInfo[targets.length];
- for (int i=0; i columnMap = new HashMap<>();
- for (ColumnInfo colInfo: targetColInfo) {
- columnMap.put(colInfo.id, createFBtypeByTajoType(colInfo.type));
- }
-
- orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize);
-
- TimeZone timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE,
- TajoConstants.DEFAULT_SYSTEM_TIMEZONE));
-
- // TODO: make OrcPredicate useful
- // presto-orc uses joda timezone, so it needs to be converted.
- recordReader = orcReader.createRecordReader(columnMap, OrcPredicate.TRUE,
- fragment.getStartKey(), fragment.getLength(), DateTimeZone.forTimeZone(timezone), aggrMemoryContext);
-
- super.init();
- LOG.debug("file fragment { path: " + fragment.getPath() +
- ", start offset: " + fragment.getStartKey() +
- ", length: " + fragment.getLength() + "}");
- }
-
- @Override
- public Tuple next() throws IOException {
- if (currentPosInBatch == batchSize) {
- getNextBatch();
-
- // EOF
- if (batchSize == -1) {
- return null;
- }
- }
-
- for (int i=0; i stripeStats;
+ private int metadataSize;
+ protected List types;
+ private List userMetadata;
+ private List fileStats;
+ private List stripes;
+ protected int rowIndexStride;
+ private long contentLength, numberOfRows;
+
+ private List versionList;
+
+ //serialized footer - Keeping this around for use by getFileMetaInfo()
+ // will help avoid cpu cycles spend in deserializing at cost of increased
+ // memory footprint.
+ private ByteBuffer footerByteBuffer;
+ // Same for metastore cache - maintains the same background buffer, but includes postscript.
+ // This will only be set if the file footer/metadata was read from disk.
+ private ByteBuffer footerMetaAndPsBuffer;
+
+ private OrcRecordReader recordReader;
+
+ private long recordCount = 0;
+
+ /**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param in the file being read
+ * @param path the filename for error messages
+ * @param psLen the postscript length
+ * @param buffer the tail of the file
+ * @throws IOException
+ */
+ static void ensureOrcFooter(FSDataInputStream in,
+ Path path,
+ int psLen,
+ ByteBuffer buffer) throws IOException {
+ int len = OrcFile.MAGIC.length();
+ if (psLen < len + 1) {
+ throw new IOException("Malformed ORC file " + path +
+ ". Invalid postscript length " + psLen);
+ }
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1 - len;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, len).equals(OrcFile.MAGIC)) {
+ // If it isn't there, this may be the 0.11.0 version of ORC.
+ // Read the first 3 bytes of the file to check for the header
+ byte[] header = new byte[len];
+ in.readFully(0, header, 0, len);
+ // if it isn't there, this isn't an ORC file
+ if (!Text.decode(header, 0 , len).equals(OrcFile.MAGIC)) {
+ throw new IOException("Malformed ORC file " + path +
+ ". Invalid postscript.");
+ }
+ }
+ }
+
+ /**
+ * Build a version string out of an array.
+ * @param version the version number as a list
+ * @return the human readable form of the version string
+ */
+ private static String versionString(List version) {
+ StringBuilder buffer = new StringBuilder();
+ for(int i=0; i < version.size(); ++i) {
+ if (i != 0) {
+ buffer.append('.');
+ }
+ buffer.append(version.get(i));
+ }
+ return buffer.toString();
+ }
+
+ /**
+ * Check to see if this ORC file is from a future version and if so,
+ * warn the user that we may not be able to read all of the column encodings.
+ * @param log the logger to write any error message to
+ * @param path the data source path for error messages
+ * @param version the version of hive that wrote the file.
+ */
+ static void checkOrcVersion(Log log, Path path, List version) {
+ if (version.size() >= 1) {
+ int major = version.get(0);
+ int minor = 0;
+ if (version.size() >= 2) {
+ minor = version.get(1);
+ }
+ if (major > OrcFile.Version.CURRENT.getMajor() ||
+ (major == OrcFile.Version.CURRENT.getMajor() &&
+ minor > OrcFile.Version.CURRENT.getMinor())) {
+ log.warn(path + " was written by a future Hive version " +
+ versionString(version) +
+ ". This file may not be readable by this version of Hive.");
+ }
+ }
+ }
+
+ public OrcScanner(Configuration conf, Schema schema, TableMeta meta, Fragment fragment) throws IOException {
+ super(conf, schema, meta, fragment);
+
+ this.path = this.fragment.getPath();
+ this.fileSystem = this.path.getFileSystem(conf);
+ }
+
+ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
+ Path path,
+ long maxFileLength
+ ) throws IOException {
+ FSDataInputStream file = fs.open(path);
+
+ // figure out the size of the file using the option or filesystem
+ long size;
+ if (maxFileLength == Long.MAX_VALUE) {
+ size = fs.getFileStatus(path).getLen();
+ } else {
+ size = maxFileLength;
+ }
+
+ //read last bytes into buffer to get PostScript
+ int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
+ ByteBuffer buffer = ByteBuffer.allocate(readSize);
+ assert buffer.position() == 0;
+ file.readFully((size - readSize),
+ buffer.array(), buffer.arrayOffset(), readSize);
+ buffer.position(0);
+
+ //read the PostScript
+ //get length of PostScript
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ ensureOrcFooter(file, path, psLen, buffer);
+ int psOffset = readSize - 1 - psLen;
+ OrcProto.PostScript ps = extractPostScript(buffer, path, psLen, psOffset);
+
+ int footerSize = (int) ps.getFooterLength();
+ int metadataSize = (int) ps.getMetadataLength();
+
+ //check if extra bytes need to be read
+ ByteBuffer fullFooterBuffer = null;
+ int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+ if (extra > 0) {
+ //more bytes need to be read, seek back to the right place and read extra bytes
+ ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
+ file.readFully((size - readSize - extra), extraBuf.array(),
+ extraBuf.arrayOffset() + extraBuf.position(), extra);
+ extraBuf.position(extra);
+ //append with already read bytes
+ extraBuf.put(buffer);
+ buffer = extraBuf;
+ buffer.position(0);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(footerSize + metadataSize);
+ } else {
+ //footer is already in the bytes in buffer, just adjust position, length
+ buffer.position(psOffset - footerSize - metadataSize);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(psOffset);
+ }
+
+ // remember position for later
+ buffer.mark();
+
+ file.close();
+
+ return new FileMetaInfo(
+ ps.getCompression().toString(),
+ (int) ps.getCompressionBlockSize(),
+ (int) ps.getMetadataLength(),
+ buffer,
+ ps.getVersionList(),
+ org.apache.orc.OrcFile.WriterVersion.FUTURE,
+ fullFooterBuffer
+ );
+ }
+
+ public OrcRecordReader createRecordReader() throws IOException {
+ return new OrcRecordReader(this.stripes, fileSystem, schema, targets, fragment, types, codec, bufferSize,
+ rowIndexStride, buildReaderOptions(meta), conf,
+ TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, TajoConstants.DEFAULT_SYSTEM_TIMEZONE)));
+ }
+
+ private static Options buildReaderOptions(TableMeta meta) {
+ return new Options()
+ .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(),
+ String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue()))))
+ .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(),
+ String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue()))));
+ }
+
+ @Override
+ public void init() throws IOException {
+ FileMetaInfo footerMetaData = extractMetaInfoFromFooter(fileSystem, path, maxLength);
+ this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
+ MetaInfoObjExtractor rInfo =
+ new MetaInfoObjExtractor(footerMetaData.compressionType,
+ footerMetaData.bufferSize,
+ footerMetaData.metadataSize,
+ footerMetaData.footerBuffer
+ );
+ this.footerByteBuffer = footerMetaData.footerBuffer;
+ this.compressionKind = rInfo.compressionKind;
+ this.codec = rInfo.codec;
+ this.bufferSize = rInfo.bufferSize;
+ this.metadataSize = rInfo.metadataSize;
+ this.stripeStats = rInfo.metadata.getStripeStatsList();
+ this.types = rInfo.footer.getTypesList();
+ this.rowIndexStride = rInfo.footer.getRowIndexStride();
+ this.contentLength = rInfo.footer.getContentLength();
+ this.numberOfRows = rInfo.footer.getNumberOfRows();
+ this.userMetadata = rInfo.footer.getMetadataList();
+ this.fileStats = rInfo.footer.getStatisticsList();
+ this.versionList = footerMetaData.versionList;
+ this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
+
+ recordReader = createRecordReader();
+
+ super.init();
+ }
+
+ @Override
+ public Tuple next() throws IOException {
+ Tuple next = recordReader.next();
+ if (next != null) {
+ recordCount++;
+ }
+ return next;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ // TODO: improve this
+ this.close();
+ recordReader = createRecordReader();
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (recordReader != null) {
+ recordReader.close();
+ tableStats.setNumBytes(recordReader.getNumBytes());
+ tableStats.setNumRows(recordCount);
+ }
+ }
+
+ @Override
+ public boolean isProjectable() {
+ return true;
+ }
+
+ @Override
+ public boolean isSelectable() {
+ return false;
+ }
+
+ @Override
+ public void setFilter(EvalNode filter) {
+ // TODO: implement this
+ }
+
+ @Override
+ public float getProgress() {
+ return inited ? recordReader.getProgress() : super.getProgress();
+ }
+
+ @Override
+ public boolean isSplittable() {
+ return true;
+ }
+
+ private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path,
+ int psLen, int psAbsOffset) throws IOException {
+ // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
+ assert bb.hasArray();
+ CodedInputStream in = CodedInputStream.newInstance(
+ bb.array(), bb.arrayOffset() + psAbsOffset, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
+ checkOrcVersion(LOG, path, ps.getVersionList());
+
+ // Check compression codec.
+ switch (ps.getCompression()) {
+ case NONE:
+ break;
+ case ZLIB:
+ break;
+ case SNAPPY:
+ break;
+ case LZO:
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown compression");
+ }
+ return ps;
+ }
+
+ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
+ int footerSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(footerAbsPos);
+ bb.limit(footerAbsPos + footerSize);
+ return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
+ }
+
+ private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(metadataAbsPos);
+ bb.limit(metadataAbsPos + metadataSize);
+ return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
+ Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
+ }
+
+ /**
+ * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
+ * from serialized fields.
+ * As the fields are final, the fields need to be initialized in the constructor and
+ * can't be done in some helper function. So this helper class is used instead.
+ *
+ */
+ private static class MetaInfoObjExtractor{
+ final org.apache.orc.CompressionKind compressionKind;
+ final CompressionCodec codec;
+ final int bufferSize;
+ final int metadataSize;
+ final OrcProto.Metadata metadata;
+ final OrcProto.Footer footer;
+
+ MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
+ ByteBuffer footerBuffer) throws IOException {
+
+ this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr);
+ this.bufferSize = bufferSize;
+ this.codec = OrcUtils.createCodec(compressionKind);
+ this.metadataSize = metadataSize;
+
+ int position = footerBuffer.position();
+ int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
+
+ this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
+ this.footer = extractFooter(
+ footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);
+
+ footerBuffer.position(position);
+ }
+ }
+
+ public static class StripeInformationImpl
+ implements org.apache.orc.StripeInformation {
+ private final OrcProto.StripeInformation stripe;
+
+ public StripeInformationImpl(OrcProto.StripeInformation stripe) {
+ this.stripe = stripe;
+ }
+
+ @Override
+ public long getOffset() {
+ return stripe.getOffset();
+ }
+
+ @Override
+ public long getLength() {
+ return stripe.getDataLength() + getIndexLength() + getFooterLength();
+ }
+
+ @Override
+ public long getDataLength() {
+ return stripe.getDataLength();
+ }
+
+ @Override
+ public long getFooterLength() {
+ return stripe.getFooterLength();
+ }
+
+ @Override
+ public long getIndexLength() {
+ return stripe.getIndexLength();
+ }
+
+ @Override
+ public long getNumberOfRows() {
+ return stripe.getNumberOfRows();
+ }
+
+ @Override
+ public String toString() {
+ return "offset: " + getOffset() + " data: " + getDataLength() +
+ " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
+ " index: " + getIndexLength();
+ }
+ }
+
+ private static List convertProtoStripesToStripes(
+ List stripes) {
+ List result = new ArrayList<>(stripes.size());
+ for (OrcProto.StripeInformation info : stripes) {
+ result.add(new StripeInformationImpl(info));
+ }
+ return result;
+ }
+
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
deleted file mode 100644
index 061ba0d034..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.common.TajoDataTypes;
-import org.apache.tajo.exception.UnsupportedException;
-
-public class ObjectInspectorFactory {
-
- public static StructObjectInspector buildStructObjectInspector(Schema schema) {
- StructObjectInspector structOI = new TajoStructObjectInspector(schema);
- return structOI;
- }
-
- public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type dataType) throws UnsupportedException {
- ObjectInspector oi = null;
-
- switch(dataType) {
- case BOOLEAN:
- oi = new TajoBooleanObjectInspector();
- break;
-
- case INT2:
- oi = new TajoShortObjectInspector();
- break;
-
- case INET4:
- case INT4:
- oi = new TajoIntObjectInspector();
- break;
-
- case INT8:
- oi = new TajoLongObjectInspector();
- break;
-
- case FLOAT4:
- oi = new TajoFloatObjectInspector();
- break;
-
- case FLOAT8:
- oi = new TajoDoubleObjectInspector();
- break;
-
- case TEXT:
- case CHAR:
- oi = new TajoStringObjectInspector();
- break;
-
- case TIMESTAMP:
- oi = new TajoTimestampObjectInspector();
- break;
-
- case DATE:
- oi = new TajoDateObjectInspector();
- break;
-
- case BLOB:
- case PROTOBUF:
- oi = new TajoBlobObjectInspector();
- break;
-
- case NULL_TYPE:
- oi = new TajoNullObjectInspector();
- break;
-
- default:
- throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender");
- }
-
- return oi;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java
deleted file mode 100644
index d241f84371..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.tajo.datum.Datum;
-
-public class TajoBlobObjectInspector extends TajoPrimitiveObjectInspector implements BinaryObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.binaryTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.BINARY;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public BytesWritable getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return byte [].class;
- }
-
- @Override
- public byte[] getPrimitiveJavaObject(Object o) {
- return ((Datum)o).asByteArray();
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public int precision() {
- return 0;
- }
-
- @Override
- public int scale() {
- return 0;
- }
-
- @Override
- public String getTypeName() {
- return "BINARY";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java
deleted file mode 100644
index 273505f0cb..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Datum;
-
-public class TajoBooleanObjectInspector extends TajoPrimitiveObjectInspector implements BooleanObjectInspector {
- @Override
- public boolean get(Object o) {
- return ((Datum)o).asBool();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.booleanTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.BOOLEAN;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Boolean.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "BOOLEAN";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java
deleted file mode 100644
index f12706b8df..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-
-import java.sql.Date;
-
-public class TajoDateObjectInspector extends TajoPrimitiveObjectInspector implements DateObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.dateTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.DATE;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public DateWritable getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return null;
- }
-
- @Override
- public Date getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "DATE";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java
deleted file mode 100644
index 6dc1f8c95c..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Float8Datum;
-
-public class TajoDoubleObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector {
- @Override
- public double get(Object o) {
- return ((Float8Datum)o).asFloat8();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.doubleTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.DOUBLE;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Double.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "DOUBLE";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java
deleted file mode 100644
index bed8784fb5..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Float4Datum;
-
-public class TajoFloatObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector {
- @Override
- public double get(Object o) {
- return ((Float4Datum)o).asFloat4();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.floatTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.FLOAT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Float.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "FLOAT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java
deleted file mode 100644
index a0c2209678..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int4Datum;
-
-public class TajoIntObjectInspector extends TajoPrimitiveObjectInspector implements IntObjectInspector {
- @Override
- public int get(Object o) {
- return ((Int4Datum)o).asInt4();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.intTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.INT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Integer.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "INT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java
deleted file mode 100644
index b30b3338f6..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int8Datum;
-
-public class TajoLongObjectInspector extends TajoPrimitiveObjectInspector implements LongObjectInspector {
- @Override
- public long get(Object o) {
- return ((Int8Datum)o).asInt8();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.shortTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.LONG;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Long.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "LONG";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java
deleted file mode 100644
index 49998ce30e..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-
-public class TajoNullObjectInspector extends TajoPrimitiveObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.voidTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.VOID;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Void.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "NULL";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java
deleted file mode 100644
index 90ac178fdd..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
-
-public abstract class TajoPrimitiveObjectInspector implements PrimitiveObjectInspector {
- @Override
- public Category getCategory() {
- return Category.PRIMITIVE;
- }
-
- @Override
- public int precision() {
- return 0;
- }
-
- @Override
- public int scale() {
- return 0;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java
deleted file mode 100644
index d32bee172a..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int2Datum;
-
-public class TajoShortObjectInspector extends TajoPrimitiveObjectInspector implements ShortObjectInspector {
- @Override
- public short get(Object o) {
- return ((Int2Datum)o).asInt2();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.shortTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.SHORT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Short.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "SHORT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java
deleted file mode 100644
index b9331da6cd..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.hadoop.io.Text;
-
-public class TajoStringObjectInspector extends TajoPrimitiveObjectInspector implements StringObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.stringTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.STRING;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Text getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return null;
- }
-
- @Override
- public String getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "STRING";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java
deleted file mode 100644
index 7521fa32c6..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.tajo.catalog.Column;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.exception.UnsupportedException;
-
-import java.util.ArrayList;
-import java.util.List;
-
-public class TajoStructObjectInspector extends StructObjectInspector {
- private final static Log LOG = LogFactory.getLog(TajoStructObjectInspector.class);
- private List structFields;
-
- static class TajoStructField implements StructField {
- private String name;
- private ObjectInspector oi;
- private String comment;
-
- TajoStructField(String name, ObjectInspector oi) {
- this(name, oi, null);
- }
-
- TajoStructField(String name, ObjectInspector oi, String comment) {
- this.name = name;
- this.oi = oi;
- this.comment = comment;
- }
-
- @Override
- public String getFieldName() {
- return name;
- }
-
- @Override
- public ObjectInspector getFieldObjectInspector() {
- return oi;
- }
-
- @Override
- public int getFieldID() {
- return 0;
- }
-
- @Override
- public String getFieldComment() {
- return comment;
- }
- }
-
- TajoStructObjectInspector(Schema schema) {
- structFields = new ArrayList<>(schema.size());
-
- for (Column c: schema.getRootColumns()) {
- try {
- TajoStructField field = new TajoStructField(c.getSimpleName(),
- ObjectInspectorFactory.buildObjectInspectorByType(c.getDataType().getType()));
- structFields.add(field);
- } catch (UnsupportedException e) {
- LOG.error(e.getMessage());
- }
- }
- }
-
- @Override
- public List extends StructField> getAllStructFieldRefs() {
- return structFields;
- }
-
- @Override
- public StructField getStructFieldRef(String s) {
- for (TajoStructField field:structFields) {
- if (field.getFieldName().equals(s)) {
- return field;
- }
- }
-
- return null;
- }
-
- @Override
- public Object getStructFieldData(Object o, StructField structField) {
- return null;
- }
-
- @Override
- public List
- * The description and format for these types are as below:
- *
- * SHORT_REPEAT: Used for short repeated integer sequences.
- *
- *
1 byte header
- *
- *
2 bits for encoding type
- *
3 bits for bytes required for repeating value
- *
3 bits for repeat count (MIN_REPEAT + run length)
- *
- *
- *
Blob - repeat value (fixed bytes)
- *
- *
- *
- * DIRECT: Used for random integer sequences whose number of bit
- * requirement doesn't vary a lot.
- *
- *
2 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- *
Blob - stores the direct values using fixed bit width. The length of the
- * data blob is (fixed width * run length) bits long
- *
- *
- *
- * PATCHED_BASE: Used for random integer sequences whose number of bit
- * requirement varies beyond a threshold.
- *
- *
4 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- * 3rd byte
- *
3 bits for bytes required to encode base value
- *
5 bits for patch width
- *
- *
- * 4th byte
- *
3 bits for patch gap width
- *
5 bits for patch length
- *
- *
- *
Base value - Stored using fixed number of bytes. If MSB is set, base
- * value is negative else positive. Length of base value is (base width * 8)
- * bits.
- *
Data blob - Base reduced values as stored using fixed bit width. Length
- * of data blob is (fixed width * run length) bits.
- *
Patch blob - Patch blob is a list of gap and patch value. Each entry in
- * the patch list is (patch width + patch gap width) bits long. Gap between the
- * subsequent elements to be patched are stored in upper part of entry whereas
- * patch values are stored in lower part of entry. Length of patch blob is
- * ((patch width + patch gap width) * patch length) bits.
- *
- *
- *
- * DELTA Used for monotonically increasing or decreasing sequences,
- * sequences with fixed delta values or long repeated sequences.
- *
- *
2 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- *
Base value - encoded as varint
- *
Delta base - encoded as varint
- *
Delta blob - only positive values. monotonicity and orderness are decided
- * based on the sign of the base value and delta base
- *
- *
- */
-class RunLengthIntegerWriterV2 implements IntegerWriter {
-
- public enum EncodingType {
- SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA
- }
-
- static final int MAX_SCOPE = 512;
- static final int MIN_REPEAT = 3;
- private static final int MAX_SHORT_REPEAT_LENGTH = 10;
- private long prevDelta = 0;
- private int fixedRunLength = 0;
- private int variableRunLength = 0;
- private final long[] literals = new long[MAX_SCOPE];
- private final PositionedOutputStream output;
- private final boolean signed;
- private EncodingType encoding;
- private int numLiterals;
- private final long[] zigzagLiterals = new long[MAX_SCOPE];
- private final long[] baseRedLiterals = new long[MAX_SCOPE];
- private final long[] adjDeltas = new long[MAX_SCOPE];
- private long fixedDelta;
- private int zzBits90p;
- private int zzBits100p;
- private int brBits95p;
- private int brBits100p;
- private int bitsDeltaMax;
- private int patchWidth;
- private int patchGapWidth;
- private int patchLength;
- private long[] gapVsPatchList;
- private long min;
- private boolean isFixedDelta;
- private SerializationUtils utils;
- private boolean alignedBitpacking;
-
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
- this(output, signed, true);
- }
-
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
- boolean alignedBitpacking) {
- this.output = output;
- this.signed = signed;
- this.alignedBitpacking = alignedBitpacking;
- this.utils = new SerializationUtils();
- clear();
- }
-
- private void writeValues() throws IOException {
- if (numLiterals != 0) {
-
- if (encoding.equals(EncodingType.SHORT_REPEAT)) {
- writeShortRepeatValues();
- } else if (encoding.equals(EncodingType.DIRECT)) {
- writeDirectValues();
- } else if (encoding.equals(EncodingType.PATCHED_BASE)) {
- writePatchedBaseValues();
- } else {
- writeDeltaValues();
- }
-
- // clear all the variables
- clear();
- }
- }
-
- private void writeDeltaValues() throws IOException {
- int len = 0;
- int fb = bitsDeltaMax;
- int efb = 0;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- if (isFixedDelta) {
- // if fixed run length is greater than threshold then it will be fixed
- // delta sequence with delta value 0 else fixed delta sequence with
- // non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
- // ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
- } else {
- // ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
- } else {
- // fixed width 0 is used for long repeating values.
- // sequences that require only 1 bit to encode will have an additional bit
- if (fb == 1) {
- fb = 2;
- }
- efb = utils.encodeBitWidth(fb);
- efb = efb << 1;
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
-
- // extract the 9th bit of run length
- final int tailBits = (len & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = len & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // store the first value from zigzag literal array
- if (signed) {
- utils.writeVslong(output, literals[0]);
- } else {
- utils.writeVulong(output, literals[0]);
- }
-
- if (isFixedDelta) {
- // if delta is fixed then we don't need to store delta blob
- utils.writeVslong(output, fixedDelta);
- } else {
- // store the first value as delta value using zigzag encoding
- utils.writeVslong(output, adjDeltas[0]);
-
- // adjacent delta values are bit packed. The length of adjDeltas array is
- // always one less than the number of literals (delta difference for n
- // elements is n-1). We have already written one element, write the
- // remaining numLiterals - 2 elements here
- utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output);
- }
- }
-
- private void writePatchedBaseValues() throws IOException {
-
- // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
- // because patch is applied to MSB bits. For example: If fixed bit width of
- // base value is 7 bits and if patch is 3 bits, the actual value is
- // constructed by shifting the patch to left by 7 positions.
- // actual_value = patch << 7 | base_value
- // So, if we align base_value then actual_value can not be reconstructed.
-
- // write the number of fixed bits required in next 5 bits
- final int fb = brBits95p;
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length, they are one off
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // if the min value is negative toggle the sign
- final boolean isNegative = min < 0 ? true : false;
- if (isNegative) {
- min = -min;
- }
-
- // find the number of bytes required for base and shift it by 5 bits
- // to accommodate patch width. The additional bit is used to store the sign
- // of the base value.
- final int baseWidth = utils.findClosestNumBits(min) + 1;
- final int baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
- final int bb = (baseBytes - 1) << 5;
-
- // if the base value is negative then set MSB to 1
- if (isNegative) {
- min |= (1L << ((baseBytes * 8) - 1));
- }
-
- // third byte contains 3 bits for number of bytes occupied by base
- // and 5 bits for patchWidth
- final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth);
-
- // fourth byte contains 3 bits for page gap width and 5 bits for
- // patch length
- final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
- output.write(headerThirdByte);
- output.write(headerFourthByte);
-
- // write the base value using fixed bytes in big endian order
- for(int i = baseBytes - 1; i >= 0; i--) {
- byte b = (byte) ((min >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- // base reduced literals are bit packed
- int closestFixedBits = utils.getClosestFixedBits(fb);
-
- utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits,
- output);
-
- // write patch list
- closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth);
-
- utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits,
- output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- /**
- * Store the opcode in 2 MSB bits
- * @return opcode
- */
- private int getOpcode() {
- return encoding.ordinal() << 6;
- }
-
- private void writeDirectValues() throws IOException {
-
- // write the number of fixed bits required in next 5 bits
- int fb = zzBits100p;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // bit packing the zigzag encoded literals
- utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- private void writeShortRepeatValues() throws IOException {
- // get the value that is repeating, compute the bits and bytes required
- long repeatVal = 0;
- if (signed) {
- repeatVal = utils.zigzagEncode(literals[0]);
- } else {
- repeatVal = literals[0];
- }
-
- final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal);
- final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3
- : (numBitsRepeatVal >>> 3) + 1;
-
- // write encoding type in top 2 bits
- int header = getOpcode();
-
- // write the number of bytes required for the value
- header |= ((numBytesRepeatVal - 1) << 3);
-
- // write the run length
- fixedRunLength -= MIN_REPEAT;
- header |= fixedRunLength;
-
- // write the header
- output.write(header);
-
- // write the repeating value in big endian byte order
- for(int i = numBytesRepeatVal - 1; i >= 0; i--) {
- int b = (int) ((repeatVal >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- fixedRunLength = 0;
- }
-
- private void determineEncoding() {
-
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
- computeZigZagLiterals();
-
- zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
-
- // not a big win for shorter runs to determine encoding
- if (numLiterals <= MIN_REPEAT) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // DELTA encoding check
-
- // for identifying monotonic sequences
- boolean isIncreasing = true;
- boolean isDecreasing = true;
- this.isFixedDelta = true;
-
- this.min = literals[0];
- long max = literals[0];
- final long initialDelta = literals[1] - literals[0];
- long currDelta = initialDelta;
- long deltaMax = initialDelta;
- this.adjDeltas[0] = initialDelta;
-
- for (int i = 1; i < numLiterals; i++) {
- final long l1 = literals[i];
- final long l0 = literals[i - 1];
- currDelta = l1 - l0;
- min = Math.min(min, l1);
- max = Math.max(max, l1);
-
- isIncreasing &= (l0 <= l1);
- isDecreasing &= (l0 >= l1);
-
- isFixedDelta &= (currDelta == initialDelta);
- if (i > 1) {
- adjDeltas[i - 1] = Math.abs(currDelta);
- deltaMax = Math.max(deltaMax, adjDeltas[i - 1]);
- }
- }
-
- // its faster to exit under delta overflow condition without checking for
- // PATCHED_BASE condition as encoding using DIRECT is faster and has less
- // overhead than PATCHED_BASE
- if (!utils.isSafeSubtract(max, min)) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // invariant - subtracting any number from any other in the literals after
- // this point won't overflow
-
- // if initialDelta is 0 then we cannot delta encode as we cannot identify
- // the sign of deltas (increasing or decreasing)
- if (initialDelta != 0) {
-
- // if min is equal to max then the delta is 0, this condition happens for
- // fixed values run >10 which cannot be encoded with SHORT_REPEAT
- if (min == max) {
- assert isFixedDelta : min + "==" + max +
- ", isFixedDelta cannot be false";
- assert currDelta == 0 : min + "==" + max + ", currDelta should be zero";
- fixedDelta = 0;
- encoding = EncodingType.DELTA;
- return;
- }
-
- if (isFixedDelta) {
- assert currDelta == initialDelta
- : "currDelta should be equal to initialDelta for fixed delta encoding";
- encoding = EncodingType.DELTA;
- fixedDelta = currDelta;
- return;
- }
-
- // stores the number of bits required for packing delta blob in
- // delta encoding
- bitsDeltaMax = utils.findClosestNumBits(deltaMax);
-
- // monotonic condition
- if (isIncreasing || isDecreasing) {
- encoding = EncodingType.DELTA;
- return;
- }
- }
-
- // PATCHED_BASE encoding check
-
- // percentile values are computed for the zigzag encoded values. if the
- // number of bit requirement between 90th and 100th percentile varies
- // beyond a threshold then we need to patch the values. if the variation
- // is not significant then we can use direct encoding
-
- zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
- int diffBitsLH = zzBits100p - zzBits90p;
-
- // if the difference between 90th percentile and 100th percentile fixed
- // bits is > 1 then we need patch the values
- if (diffBitsLH > 1) {
-
- // patching is done only on base reduced values.
- // remove base from literals
- for (int i = 0; i < numLiterals; i++) {
- baseRedLiterals[i] = literals[i] - min;
- }
-
- // 95th percentile width is used to determine max allowed value
- // after which patching will be done
- brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
-
- // 100th percentile is used to compute the max patch width
- brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);
-
- // after base reducing the values, if the difference in bits between
- // 95th percentile and 100th percentile value is zero then there
- // is no point in patching the values, in which case we will
- // fallback to DIRECT encoding.
- // The decision to use patched base was based on zigzag values, but the
- // actual patching is done on base reduced literals.
- if ((brBits100p - brBits95p) != 0) {
- encoding = EncodingType.PATCHED_BASE;
- preparePatchedBlob();
- return;
- } else {
- encoding = EncodingType.DIRECT;
- return;
- }
- } else {
- // if difference in bits between 95th percentile and 100th percentile is
- // 0, then patch length will become 0. Hence we will fallback to direct
- encoding = EncodingType.DIRECT;
- return;
- }
- }
-
- private void computeZigZagLiterals() {
- // populate zigzag encoded literals
- long zzEncVal = 0;
- for (int i = 0; i < numLiterals; i++) {
- if (signed) {
- zzEncVal = utils.zigzagEncode(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[i] = zzEncVal;
- }
- }
-
- private void preparePatchedBlob() {
- // mask will be max value beyond which patch will be generated
- long mask = (1L << brBits95p) - 1;
-
- // since we are considering only 95 percentile, the size of gap and
- // patch array can contain only be 5% values
- patchLength = (int) Math.ceil((numLiterals * 0.05));
-
- int[] gapList = new int[patchLength];
- long[] patchList = new long[patchLength];
-
- // #bit for patch
- patchWidth = brBits100p - brBits95p;
- patchWidth = utils.getClosestFixedBits(patchWidth);
-
- // if patch bit requirement is 64 then it will not possible to pack
- // gap and patch together in a long. To make sure gap and patch can be
- // packed together adjust the patch width
- if (patchWidth == 64) {
- patchWidth = 56;
- brBits95p = 8;
- mask = (1L << brBits95p) - 1;
- }
-
- int gapIdx = 0;
- int patchIdx = 0;
- int prev = 0;
- int gap = 0;
- int maxGap = 0;
-
- for(int i = 0; i < numLiterals; i++) {
- // if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
- gap = i - prev;
- if (gap > maxGap) {
- maxGap = gap;
- }
-
- // gaps are relative, so store the previous patched value index
- prev = i;
- gapList[gapIdx++] = gap;
-
- // extract the most significant bits that are over mask bits
- long patch = baseRedLiterals[i] >>> brBits95p;
- patchList[patchIdx++] = patch;
-
- // strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
- }
- }
-
- // adjust the patch length to number of entries in gap list
- patchLength = gapIdx;
-
- // if the element to be patched is the first and only element then
- // max gap will be 0, but to store the gap as 0 we need atleast 1 bit
- if (maxGap == 0 && patchLength != 0) {
- patchGapWidth = 1;
- } else {
- patchGapWidth = utils.findClosestNumBits(maxGap);
- }
-
- // special case: if the patch gap width is greater than 256, then
- // we need 9 bits to encode the gap width. But we only have 3 bits in
- // header to record the gap width. To deal with this case, we will save
- // two entries in patch list in the following way
- // 256 gap width => 0 for patch value
- // actual gap - 256 => actual patch value
- // We will do the same for gap width = 511. If the element to be patched is
- // the last element in the scope then gap width will be 511. In this case we
- // will have 3 entries in the patch list in the following way
- // 255 gap width => 0 for patch value
- // 255 gap width => 0 for patch value
- // 1 gap width => actual patch value
- if (patchGapWidth > 8) {
- patchGapWidth = 8;
- // for gap = 511, we need two additional entries in patch list
- if (maxGap == 511) {
- patchLength += 2;
- } else {
- patchLength += 1;
- }
- }
-
- // create gap vs patch list
- gapIdx = 0;
- patchIdx = 0;
- gapVsPatchList = new long[patchLength];
- for(int i = 0; i < patchLength; i++) {
- long g = gapList[gapIdx++];
- long p = patchList[patchIdx++];
- while (g > 255) {
- gapVsPatchList[i++] = (255L << patchWidth);
- g -= 255;
- }
-
- // store patch value in LSBs and gap in MSBs
- gapVsPatchList[i] = (g << patchWidth) | p;
- }
- }
-
- /**
- * clears all the variables
- */
- private void clear() {
- numLiterals = 0;
- encoding = null;
- prevDelta = 0;
- fixedDelta = 0;
- zzBits90p = 0;
- zzBits100p = 0;
- brBits95p = 0;
- brBits100p = 0;
- bitsDeltaMax = 0;
- patchGapWidth = 0;
- patchLength = 0;
- patchWidth = 0;
- gapVsPatchList = null;
- min = 0;
- isFixedDelta = true;
- }
-
- @Override
- public void flush() throws IOException {
- if (numLiterals != 0) {
- if (variableRunLength != 0) {
- determineEncoding();
- writeValues();
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding();
- writeValues();
- } else if (fixedRunLength >= MIN_REPEAT
- && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
- }
- output.flush();
- }
-
- @Override
- public void write(long val) throws IOException {
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- if (numLiterals == 1) {
- prevDelta = val - literals[0];
- literals[numLiterals++] = val;
- // if both values are same count as fixed run else variable run
- if (val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
- } else {
- fixedRunLength = 0;
- variableRunLength = 2;
- }
- } else {
- long currentDelta = val - literals[numLiterals - 1];
- if (prevDelta == 0 && currentDelta == 0) {
- // fixed delta run
-
- literals[numLiterals++] = val;
-
- // if variable run is non-zero then we are seeing repeating
- // values at the end of variable run in which case keep
- // updating variable and fixed runs
- if (variableRunLength > 0) {
- fixedRunLength = 2;
- }
- fixedRunLength += 1;
-
- // if fixed run met the minimum condition and if variable
- // run is non-zero then flush the variable run and shift the
- // tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
- numLiterals -= MIN_REPEAT;
- variableRunLength -= MIN_REPEAT - 1;
- // copy the tail fixed runs
- long[] tailVals = new long[MIN_REPEAT];
- System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT);
-
- // determine variable encoding and flush values
- determineEncoding();
- writeValues();
-
- // shift tail fixed runs to beginning of the buffer
- for(long l : tailVals) {
- literals[numLiterals++] = l;
- }
- }
-
- // if fixed runs reached max repeat length then write values
- if (fixedRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- } else {
- // variable delta run
-
- // if fixed run length is non-zero and if it satisfies the
- // short repeat conditions then write the values as short repeats
- // else use delta encoding
- if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
-
- // if fixed run length is 0 && fixedRunLength < MIN_REPEAT) {
- if (val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- }
- }
-
- // after writing values re-initialize the variables
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- // keep updating variable run lengths
- prevDelta = val - literals[numLiterals - 1];
- literals[numLiterals++] = val;
- variableRunLength += 1;
-
- // if variable run length reach the max scope, write it
- if (variableRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- }
- }
- }
- }
- }
-
- private void initializeLiterals(long val) {
- literals[numLiterals++] = val;
- fixedRunLength = 1;
- variableRunLength = 1;
- }
-
- @Override
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(numLiterals);
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java
deleted file mode 100644
index 53687b7fdb..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java
+++ /dev/null
@@ -1,844 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.math.BigInteger;
-
-final class SerializationUtils {
-
- private final static int BUFFER_SIZE = 64;
- private final byte[] readBuffer;
- private final byte[] writeBuffer;
-
- public SerializationUtils() {
- this.readBuffer = new byte[BUFFER_SIZE];
- this.writeBuffer = new byte[BUFFER_SIZE];
- }
-
- void writeVulong(OutputStream output, long value) throws IOException {
- while (true) {
- if ((value & ~0x7f) == 0) {
- output.write((byte) value);
- return;
- } else {
- output.write((byte) (0x80 | (value & 0x7f)));
- value >>>= 7;
- }
- }
- }
-
- void writeVslong(OutputStream output, long value) throws IOException {
- writeVulong(output, (value << 1) ^ (value >> 63));
- }
-
-
- long readVulong(InputStream in) throws IOException {
- long result = 0;
- long b;
- int offset = 0;
- do {
- b = in.read();
- if (b == -1) {
- throw new EOFException("Reading Vulong past EOF");
- }
- result |= (0x7f & b) << offset;
- offset += 7;
- } while (b >= 0x80);
- return result;
- }
-
- long readVslong(InputStream in) throws IOException {
- long result = readVulong(in);
- return (result >>> 1) ^ -(result & 1);
- }
-
- float readFloat(InputStream in) throws IOException {
- int ser = in.read() | (in.read() << 8) | (in.read() << 16) |
- (in.read() << 24);
- return Float.intBitsToFloat(ser);
- }
-
- void writeFloat(OutputStream output, float value) throws IOException {
- int ser = Float.floatToIntBits(value);
- output.write(ser & 0xff);
- output.write((ser >> 8) & 0xff);
- output.write((ser >> 16) & 0xff);
- output.write((ser >> 24) & 0xff);
- }
-
- double readDouble(InputStream in) throws IOException {
- return Double.longBitsToDouble(readLongLE(in));
- }
-
- long readLongLE(InputStream in) throws IOException {
- in.read(readBuffer, 0, 8);
- return (((readBuffer[0] & 0xff) << 0)
- + ((readBuffer[1] & 0xff) << 8)
- + ((readBuffer[2] & 0xff) << 16)
- + ((long) (readBuffer[3] & 0xff) << 24)
- + ((long) (readBuffer[4] & 0xff) << 32)
- + ((long) (readBuffer[5] & 0xff) << 40)
- + ((long) (readBuffer[6] & 0xff) << 48)
- + ((long) (readBuffer[7] & 0xff) << 56));
- }
-
- void writeDouble(OutputStream output, double value) throws IOException {
- writeLongLE(output, Double.doubleToLongBits(value));
- }
-
- private void writeLongLE(OutputStream output, long value) throws IOException {
- writeBuffer[0] = (byte) ((value >> 0) & 0xff);
- writeBuffer[1] = (byte) ((value >> 8) & 0xff);
- writeBuffer[2] = (byte) ((value >> 16) & 0xff);
- writeBuffer[3] = (byte) ((value >> 24) & 0xff);
- writeBuffer[4] = (byte) ((value >> 32) & 0xff);
- writeBuffer[5] = (byte) ((value >> 40) & 0xff);
- writeBuffer[6] = (byte) ((value >> 48) & 0xff);
- writeBuffer[7] = (byte) ((value >> 56) & 0xff);
- output.write(writeBuffer, 0, 8);
- }
-
- /**
- * Write the arbitrarily sized signed BigInteger in vint format.
- *
- * Signed integers are encoded using the low bit as the sign bit using zigzag
- * encoding.
- *
- * Each byte uses the low 7 bits for data and the high bit for stop/continue.
- *
- * Bytes are stored LSB first.
- * @param output the stream to write to
- * @param value the value to output
- * @throws IOException
- */
- static void writeBigInteger(OutputStream output,
- BigInteger value) throws IOException {
- // encode the signed number as a positive integer
- value = value.shiftLeft(1);
- int sign = value.signum();
- if (sign < 0) {
- value = value.negate();
- value = value.subtract(BigInteger.ONE);
- }
- int length = value.bitLength();
- while (true) {
- long lowBits = value.longValue() & 0x7fffffffffffffffL;
- length -= 63;
- // write out the next 63 bits worth of data
- for(int i=0; i < 9; ++i) {
- // if this is the last byte, leave the high bit off
- if (length <= 0 && (lowBits & ~0x7f) == 0) {
- output.write((byte) lowBits);
- return;
- } else {
- output.write((byte) (0x80 | (lowBits & 0x7f)));
- lowBits >>>= 7;
- }
- }
- value = value.shiftRight(63);
- }
- }
-
- /**
- * Read the signed arbitrary sized BigInteger BigInteger in vint format
- * @param input the stream to read from
- * @return the read BigInteger
- * @throws IOException
- */
- static BigInteger readBigInteger(InputStream input) throws IOException {
- BigInteger result = BigInteger.ZERO;
- long work = 0;
- int offset = 0;
- long b;
- do {
- b = input.read();
- if (b == -1) {
- throw new EOFException("Reading BigInteger past EOF from " + input);
- }
- work |= (0x7f & b) << (offset % 63);
- offset += 7;
- // if we've read 63 bits, roll them into the result
- if (offset == 63) {
- result = BigInteger.valueOf(work);
- work = 0;
- } else if (offset % 63 == 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63));
- work = 0;
- }
- } while (b >= 0x80);
- if (work != 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63));
- }
- // convert back to a signed number
- boolean isNegative = result.testBit(0);
- if (isNegative) {
- result = result.add(BigInteger.ONE);
- result = result.negate();
- }
- result = result.shiftRight(1);
- return result;
- }
-
- enum FixedBitSizes {
- ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
- THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
- TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
- TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR;
- }
-
- /**
- * Count the number of bits required to encode the given value
- * @param value
- * @return bits required to store value
- */
- int findClosestNumBits(long value) {
- int count = 0;
- while (value != 0) {
- count++;
- value = value >>> 1;
- }
- return getClosestFixedBits(count);
- }
-
- /**
- * zigzag encode the given value
- * @param val
- * @return zigzag encoded value
- */
- long zigzagEncode(long val) {
- return (val << 1) ^ (val >> 63);
- }
-
- /**
- * zigzag decode the given value
- * @param val
- * @return zizag decoded value
- */
- long zigzagDecode(long val) {
- return (val >>> 1) ^ -(val & 1);
- }
-
- /**
- * Compute the bits required to represent pth percentile value
- * @param data - array
- * @param p - percentile value (>=0.0 to <=1.0)
- * @return pth percentile bits
- */
- int percentileBits(long[] data, int offset, int length, double p) {
- if ((p > 1.0) || (p <= 0.0)) {
- return -1;
- }
-
- // histogram that store the encoded bit requirement for each values.
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
- int[] hist = new int[32];
-
- // compute the histogram
- for(int i = offset; i < (offset + length); i++) {
- int idx = encodeBitWidth(findClosestNumBits(data[i]));
- hist[idx] += 1;
- }
-
- int perLen = (int) (length * (1.0 - p));
-
- // return the bits required by pth percentile length
- for(int i = hist.length - 1; i >= 0; i--) {
- perLen -= hist[i];
- if (perLen < 0) {
- return decodeBitWidth(i);
- }
- }
-
- return 0;
- }
-
- /**
- * Calculate the number of bytes required
- * @param n - number of values
- * @param numBits - bit width
- * @return number of bytes required
- */
- int getTotalBytesRequired(int n, int numBits) {
- return (n * numBits + 7) / 8;
- }
-
- /**
- * For a given fixed bit this function will return the closest available fixed
- * bit
- * @param n
- * @return closest valid fixed bit
- */
- int getClosestFixedBits(int n) {
- if (n == 0) {
- return 1;
- }
-
- if (n >= 1 && n <= 24) {
- return n;
- } else if (n > 24 && n <= 26) {
- return 26;
- } else if (n > 26 && n <= 28) {
- return 28;
- } else if (n > 28 && n <= 30) {
- return 30;
- } else if (n > 30 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- public int getClosestAlignedFixedBits(int n) {
- if (n == 0 || n == 1) {
- return 1;
- } else if (n > 1 && n <= 2) {
- return 2;
- } else if (n > 2 && n <= 4) {
- return 4;
- } else if (n > 4 && n <= 8) {
- return 8;
- } else if (n > 8 && n <= 16) {
- return 16;
- } else if (n > 16 && n <= 24) {
- return 24;
- } else if (n > 24 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Finds the closest available fixed bit width match and returns its encoded
- * value (ordinal)
- * @param n - fixed bit width to encode
- * @return encoded fixed bit width
- */
- int encodeBitWidth(int n) {
- n = getClosestFixedBits(n);
-
- if (n >= 1 && n <= 24) {
- return n - 1;
- } else if (n > 24 && n <= 26) {
- return FixedBitSizes.TWENTYSIX.ordinal();
- } else if (n > 26 && n <= 28) {
- return FixedBitSizes.TWENTYEIGHT.ordinal();
- } else if (n > 28 && n <= 30) {
- return FixedBitSizes.THIRTY.ordinal();
- } else if (n > 30 && n <= 32) {
- return FixedBitSizes.THIRTYTWO.ordinal();
- } else if (n > 32 && n <= 40) {
- return FixedBitSizes.FORTY.ordinal();
- } else if (n > 40 && n <= 48) {
- return FixedBitSizes.FORTYEIGHT.ordinal();
- } else if (n > 48 && n <= 56) {
- return FixedBitSizes.FIFTYSIX.ordinal();
- } else {
- return FixedBitSizes.SIXTYFOUR.ordinal();
- }
- }
-
- /**
- * Decodes the ordinal fixed bit value to actual fixed bit width value
- * @param n - encoded fixed bit width
- * @return decoded fixed bit width
- */
- int decodeBitWidth(int n) {
- if (n >= FixedBitSizes.ONE.ordinal()
- && n <= FixedBitSizes.TWENTYFOUR.ordinal()) {
- return n + 1;
- } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) {
- return 26;
- } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) {
- return 28;
- } else if (n == FixedBitSizes.THIRTY.ordinal()) {
- return 30;
- } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) {
- return 32;
- } else if (n == FixedBitSizes.FORTY.ordinal()) {
- return 40;
- } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) {
- return 48;
- } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Bitpack and write the input values to underlying output stream
- * @param input - values to write
- * @param offset - offset
- * @param len - length
- * @param bitSize - bit width
- * @param output - output stream
- * @throws IOException
- */
- void writeInts(long[] input, int offset, int len, int bitSize,
- OutputStream output) throws IOException {
- if (input == null || input.length < 1 || offset < 0 || len < 1
- || bitSize < 1) {
- return;
- }
-
- switch (bitSize) {
- case 1:
- unrolledBitPack1(input, offset, len, output);
- return;
- case 2:
- unrolledBitPack2(input, offset, len, output);
- return;
- case 4:
- unrolledBitPack4(input, offset, len, output);
- return;
- case 8:
- unrolledBitPack8(input, offset, len, output);
- return;
- case 16:
- unrolledBitPack16(input, offset, len, output);
- return;
- case 24:
- unrolledBitPack24(input, offset, len, output);
- return;
- case 32:
- unrolledBitPack32(input, offset, len, output);
- return;
- case 40:
- unrolledBitPack40(input, offset, len, output);
- return;
- case 48:
- unrolledBitPack48(input, offset, len, output);
- return;
- case 56:
- unrolledBitPack56(input, offset, len, output);
- return;
- case 64:
- unrolledBitPack64(input, offset, len, output);
- return;
- default:
- break;
- }
-
- int bitsLeft = 8;
- byte current = 0;
- for(int i = offset; i < (offset + len); i++) {
- long value = input[i];
- int bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= value >>> (bitsToWrite - bitsLeft);
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (1L << bitsToWrite) - 1;
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- bitsLeft -= bitsToWrite;
- current |= value << bitsLeft;
- if (bitsLeft == 0) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- // flush
- if (bitsLeft != 8) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- private void unrolledBitPack1(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 1) << 7)
- | ((input[i + 1] & 1) << 6)
- | ((input[i + 2] & 1) << 5)
- | ((input[i + 3] & 1) << 4)
- | ((input[i + 4] & 1) << 3)
- | ((input[i + 5] & 1) << 2)
- | ((input[i + 6] & 1) << 1)
- | (input[i + 7]) & 1);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 7;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 1) << startShift);
- startShift -= 1;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack2(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 4;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 3) << 6)
- | ((input[i + 1] & 3) << 4)
- | ((input[i + 2] & 3) << 2)
- | (input[i + 3]) & 3);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 6;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 3) << startShift);
- startShift -= 2;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack4(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 2;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 4;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 15) << startShift);
- startShift -= 4;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack8(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 1);
- }
-
- private void unrolledBitPack16(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 2);
- }
-
- private void unrolledBitPack24(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 3);
- }
-
- private void unrolledBitPack32(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 4);
- }
-
- private void unrolledBitPack40(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 5);
- }
-
- private void unrolledBitPack48(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 6);
- }
-
- private void unrolledBitPack56(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 7);
- }
-
- private void unrolledBitPack64(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 8);
- }
-
- private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int i = offset;
- for (; i < endUnroll; i = i + numHops) {
- writeLongBE(output, input, i, numHops, numBytes);
- }
-
- if (remainder > 0) {
- writeRemainingLongs(output, i, input, remainder, numBytes);
- }
- }
-
- private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder,
- int numBytes) throws IOException {
- final int numHops = remainder;
-
- int idx = 0;
- switch (numBytes) {
- case 1:
- while (remainder > 0) {
- writeBuffer[idx] = (byte) (input[offset + idx] & 255);
- remainder--;
- idx++;
- }
- break;
- case 2:
- while (remainder > 0) {
- writeLongBE2(output, input[offset + idx], idx * 2);
- remainder--;
- idx++;
- }
- break;
- case 3:
- while (remainder > 0) {
- writeLongBE3(output, input[offset + idx], idx * 3);
- remainder--;
- idx++;
- }
- break;
- case 4:
- while (remainder > 0) {
- writeLongBE4(output, input[offset + idx], idx * 4);
- remainder--;
- idx++;
- }
- break;
- case 5:
- while (remainder > 0) {
- writeLongBE5(output, input[offset + idx], idx * 5);
- remainder--;
- idx++;
- }
- break;
- case 6:
- while (remainder > 0) {
- writeLongBE6(output, input[offset + idx], idx * 6);
- remainder--;
- idx++;
- }
- break;
- case 7:
- while (remainder > 0) {
- writeLongBE7(output, input[offset + idx], idx * 7);
- remainder--;
- idx++;
- }
- break;
- case 8:
- while (remainder > 0) {
- writeLongBE8(output, input[offset + idx], idx * 8);
- remainder--;
- idx++;
- }
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException {
-
- switch (numBytes) {
- case 1:
- writeBuffer[0] = (byte) (input[offset + 0] & 255);
- writeBuffer[1] = (byte) (input[offset + 1] & 255);
- writeBuffer[2] = (byte) (input[offset + 2] & 255);
- writeBuffer[3] = (byte) (input[offset + 3] & 255);
- writeBuffer[4] = (byte) (input[offset + 4] & 255);
- writeBuffer[5] = (byte) (input[offset + 5] & 255);
- writeBuffer[6] = (byte) (input[offset + 6] & 255);
- writeBuffer[7] = (byte) (input[offset + 7] & 255);
- break;
- case 2:
- writeLongBE2(output, input[offset + 0], 0);
- writeLongBE2(output, input[offset + 1], 2);
- writeLongBE2(output, input[offset + 2], 4);
- writeLongBE2(output, input[offset + 3], 6);
- writeLongBE2(output, input[offset + 4], 8);
- writeLongBE2(output, input[offset + 5], 10);
- writeLongBE2(output, input[offset + 6], 12);
- writeLongBE2(output, input[offset + 7], 14);
- break;
- case 3:
- writeLongBE3(output, input[offset + 0], 0);
- writeLongBE3(output, input[offset + 1], 3);
- writeLongBE3(output, input[offset + 2], 6);
- writeLongBE3(output, input[offset + 3], 9);
- writeLongBE3(output, input[offset + 4], 12);
- writeLongBE3(output, input[offset + 5], 15);
- writeLongBE3(output, input[offset + 6], 18);
- writeLongBE3(output, input[offset + 7], 21);
- break;
- case 4:
- writeLongBE4(output, input[offset + 0], 0);
- writeLongBE4(output, input[offset + 1], 4);
- writeLongBE4(output, input[offset + 2], 8);
- writeLongBE4(output, input[offset + 3], 12);
- writeLongBE4(output, input[offset + 4], 16);
- writeLongBE4(output, input[offset + 5], 20);
- writeLongBE4(output, input[offset + 6], 24);
- writeLongBE4(output, input[offset + 7], 28);
- break;
- case 5:
- writeLongBE5(output, input[offset + 0], 0);
- writeLongBE5(output, input[offset + 1], 5);
- writeLongBE5(output, input[offset + 2], 10);
- writeLongBE5(output, input[offset + 3], 15);
- writeLongBE5(output, input[offset + 4], 20);
- writeLongBE5(output, input[offset + 5], 25);
- writeLongBE5(output, input[offset + 6], 30);
- writeLongBE5(output, input[offset + 7], 35);
- break;
- case 6:
- writeLongBE6(output, input[offset + 0], 0);
- writeLongBE6(output, input[offset + 1], 6);
- writeLongBE6(output, input[offset + 2], 12);
- writeLongBE6(output, input[offset + 3], 18);
- writeLongBE6(output, input[offset + 4], 24);
- writeLongBE6(output, input[offset + 5], 30);
- writeLongBE6(output, input[offset + 6], 36);
- writeLongBE6(output, input[offset + 7], 42);
- break;
- case 7:
- writeLongBE7(output, input[offset + 0], 0);
- writeLongBE7(output, input[offset + 1], 7);
- writeLongBE7(output, input[offset + 2], 14);
- writeLongBE7(output, input[offset + 3], 21);
- writeLongBE7(output, input[offset + 4], 28);
- writeLongBE7(output, input[offset + 5], 35);
- writeLongBE7(output, input[offset + 6], 42);
- writeLongBE7(output, input[offset + 7], 49);
- break;
- case 8:
- writeLongBE8(output, input[offset + 0], 0);
- writeLongBE8(output, input[offset + 1], 8);
- writeLongBE8(output, input[offset + 2], 16);
- writeLongBE8(output, input[offset + 3], 24);
- writeLongBE8(output, input[offset + 4], 32);
- writeLongBE8(output, input[offset + 5], 40);
- writeLongBE8(output, input[offset + 6], 48);
- writeLongBE8(output, input[offset + 7], 56);
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE2(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 0);
- }
-
- private void writeLongBE3(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 0);
- }
-
- private void writeLongBE4(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 0);
- }
-
- private void writeLongBE5(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 0);
- }
-
- private void writeLongBE6(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 0);
- }
-
- private void writeLongBE7(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 0);
- }
-
- private void writeLongBE8(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 56);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 7] = (byte) (val >>> 0);
- }
-
- // Do not want to use Guava LongMath.checkedSubtract() here as it will throw
- // ArithmeticException in case of overflow
- public boolean isSafeSubtract(long left, long right) {
- return (left ^ right) >= 0 | (left ^ (left - right)) >= 0;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java
deleted file mode 100644
index 285a32aeb8..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType;
-import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.iq80.snappy.Snappy;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-class SnappyCodec implements CompressionCodec, DirectDecompressionCodec {
-
- Boolean direct = null;
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- int inBytes = in.remaining();
- // I should work on a patch for Snappy to support an overflow buffer
- // to prevent the extra buffer copy.
- byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)];
- int outBytes =
- Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes,
- compressed, 0);
- if (outBytes < inBytes) {
- int remaining = out.remaining();
- if (remaining >= outBytes) {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), outBytes);
- out.position(out.position() + outBytes);
- } else {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), remaining);
- out.position(out.limit());
- System.arraycopy(compressed, remaining, overflow.array(),
- overflow.arrayOffset(), outBytes - remaining);
- overflow.position(outBytes - remaining);
- }
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
- int inOffset = in.position();
- int uncompressLen =
- Snappy.uncompress(in.array(), in.arrayOffset() + inOffset,
- in.limit() - inOffset, out.array(), out.arrayOffset() + out.position());
- out.position(uncompressLen + out.position());
- out.flip();
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- try {
- if (ShimLoader.getHadoopShims().getDirectDecompressor(
- DirectCompressionType.SNAPPY) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims()
- .getDirectDecompressor(DirectCompressionType.SNAPPY);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(EnumSet modifiers) {
- // snappy allows no modifications
- return this;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java
deleted file mode 100644
index 382164530c..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * The name of a stream within a stripe.
- */
-class StreamName implements Comparable {
- private final int column;
- private final OrcProto.Stream.Kind kind;
-
- public enum Area {
- DATA, INDEX
- }
-
- public StreamName(int column, OrcProto.Stream.Kind kind) {
- this.column = column;
- this.kind = kind;
- }
-
- public boolean equals(Object obj) {
- if (obj != null && obj instanceof StreamName) {
- StreamName other = (StreamName) obj;
- return other.column == column && other.kind == kind;
- } else {
- return false;
- }
- }
-
- @Override
- public int compareTo(StreamName streamName) {
- if (streamName == null) {
- return -1;
- }
- Area area = getArea(kind);
- Area otherArea = StreamName.getArea(streamName.kind);
- if (area != otherArea) {
- return -area.compareTo(otherArea);
- }
- if (column != streamName.column) {
- return column < streamName.column ? -1 : 1;
- }
- return kind.compareTo(streamName.kind);
- }
-
- public int getColumn() {
- return column;
- }
-
- public OrcProto.Stream.Kind getKind() {
- return kind;
- }
-
- public Area getArea() {
- return getArea(kind);
- }
-
- public static Area getArea(OrcProto.Stream.Kind kind) {
- switch (kind) {
- case ROW_INDEX:
- case DICTIONARY_COUNT:
- case BLOOM_FILTER:
- return Area.INDEX;
- default:
- return Area.DATA;
- }
- }
-
- @Override
- public String toString() {
- return "Stream for column " + column + " kind " + kind;
- }
-
- @Override
- public int hashCode() {
- return column * 101 + kind.getNumber();
- }
-}
-
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java
deleted file mode 100644
index 42486646bf..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * Statistics for string columns.
- */
-public interface StringColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum string.
- * @return the minimum
- */
- String getMinimum();
-
- /**
- * Get the maximum string.
- * @return the maximum
- */
- String getMaximum();
-
- /**
- * Get the total length of all strings
- * @return the sum (total length)
- */
- long getSum();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java
deleted file mode 100644
index 8835cefa5e..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.io.Text;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * A red-black tree that stores strings. The strings are stored as UTF-8 bytes
- * and an offset for each entry.
- */
-class StringRedBlackTree extends RedBlackTree {
- private final DynamicByteArray byteArray = new DynamicByteArray();
- private final DynamicIntArray keyOffsets;
- private String newKey;
-
- public StringRedBlackTree(int initialCapacity) {
- super(initialCapacity);
- keyOffsets = new DynamicIntArray(initialCapacity);
- }
-
- public int add(String value) {
- newKey = value;
- return addNewKey();
- }
-
- private int addNewKey() {
- // if the newKey is actually new, add it to our byteArray and store the offset & length
- if (add()) {
- int len = newKey.length();
- keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
- }
- return lastAdd;
- }
-
- public int add(Text value) {
- newKey = value.toString();
- return addNewKey();
- }
-
- @Override
- protected int compareValue(int position) {
- int start = keyOffsets.get(position);
- int end;
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(position+1);
- }
- return byteArray.compare(newKey.getBytes(), 0, newKey.length(),
- start, end - start);
- }
-
- /**
- * The information about each node.
- */
- public interface VisitorContext {
- /**
- * Get the position where the key was originally added.
- * @return the number returned by add.
- */
- int getOriginalPosition();
-
- /**
- * Write the bytes for the string to the given output stream.
- * @param out the stream to write to.
- * @throws IOException
- */
- void writeBytes(OutputStream out) throws IOException;
-
- /**
- * Get the original string.
- * @return the string
- */
- Text getText();
-
- /**
- * Get the number of bytes.
- * @return the string's length in bytes
- */
- int getLength();
- }
-
- /**
- * The interface for visitors.
- */
- public interface Visitor {
- /**
- * Called once for each node of the tree in sort order.
- * @param context the information about each node
- * @throws IOException
- */
- void visit(VisitorContext context) throws IOException;
- }
-
- private class VisitorContextImpl implements VisitorContext {
- private int originalPosition;
- private int start;
- private int end;
- private final Text text = new Text();
-
- public int getOriginalPosition() {
- return originalPosition;
- }
-
- public Text getText() {
- byteArray.setText(text, start, end - start);
- return text;
- }
-
- public void writeBytes(OutputStream out) throws IOException {
- byteArray.write(out, start, end - start);
- }
-
- public int getLength() {
- return end - start;
- }
-
- void setPosition(int position) {
- originalPosition = position;
- start = keyOffsets.get(originalPosition);
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(originalPosition + 1);
- }
- }
- }
-
- private void recurse(int node, Visitor visitor, VisitorContextImpl context
- ) throws IOException {
- if (node != NULL) {
- recurse(getLeft(node), visitor, context);
- context.setPosition(node);
- visitor.visit(context);
- recurse(getRight(node), visitor, context);
- }
- }
-
- /**
- * Visit all of the nodes in the tree in sorted order.
- * @param visitor the action to be applied to each node
- * @throws IOException
- */
- public void visit(Visitor visitor) throws IOException {
- recurse(root, visitor, new VisitorContextImpl());
- }
-
- /**
- * Reset the table to empty.
- */
- public void clear() {
- super.clear();
- byteArray.clear();
- keyOffsets.clear();
- }
-
- public void getText(Text result, int originalPosition) {
- int offset = keyOffsets.get(originalPosition);
- int length;
- if (originalPosition + 1 == keyOffsets.size()) {
- length = byteArray.size() - offset;
- } else {
- length = keyOffsets.get(originalPosition + 1) - offset;
- }
- byteArray.setText(result, offset, length);
- }
-
- /**
- * Get the size of the character data in the table.
- * @return the bytes used by the table
- */
- public int getCharacterSize() {
- return byteArray.size();
- }
-
- /**
- * Calculate the approximate size in memory.
- * @return the number of bytes used in storing the tree.
- */
- public long getSizeInBytes() {
- return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() +
- super.getSizeInBytes();
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java
deleted file mode 100644
index 62819c1a22..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * Information about the stripes in an ORC file that is provided by the Reader.
- */
-public interface StripeInformation {
- /**
- * Get the byte offset of the start of the stripe.
- * @return the bytes from the start of the file
- */
- long getOffset();
-
- /**
- * Get the total length of the stripe in bytes.
- * @return the number of bytes in the stripe
- */
- long getLength();
-
- /**
- * Get the length of the stripe's indexes.
- * @return the number of bytes in the index
- */
- long getIndexLength();
-
- /**
- * Get the length of the stripe's data.
- * @return the number of bytes in the stripe
- */
- long getDataLength();
-
- /**
- * Get the length of the stripe's tail section, which contains its index.
- * @return the number of bytes in the tail
- */
- long getFooterLength();
-
- /**
- * Get the number of rows in the stripe.
- * @return a count of the number of rows
- */
- long getNumberOfRows();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java
deleted file mode 100644
index 013fc8ec80..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.util.List;
-
-public class StripeStatistics {
- private final List cs;
-
- StripeStatistics(List list) {
- this.cs = list;
- }
-
- /**
- * Return list of column statistics
- *
- * @return column stats
- */
- public ColumnStatistics[] getColumnStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[cs.size()];
- for (int i = 0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
- }
- return result;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java
deleted file mode 100644
index 6fad0ac1fe..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.sql.Timestamp;
-
-/**
- * Statistics for Timestamp columns.
- */
-public interface TimestampColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum value for the column.
- * @return minimum value
- */
- Timestamp getMinimum();
-
- /**
- * Get the maximum value for the column.
- * @return maximum value
- */
- Timestamp getMaximum();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
new file mode 100644
index 0000000000..6ab630aed1
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
@@ -0,0 +1,1557 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcProto;
+import org.apache.orc.impl.*;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.TypeDesc;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.datum.DatumFactory;
+import org.apache.tajo.datum.NullDatum;
+import org.apache.tajo.exception.TajoRuntimeException;
+import org.apache.tajo.exception.UnsupportedException;
+import org.apache.tajo.util.datetime.DateTimeConstants;
+import org.apache.tajo.util.datetime.DateTimeUtil;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Timestamp;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+import static org.apache.tajo.storage.thirdparty.orc.WriterImpl.BASE_TIMESTAMP_STRING;
+
+public class TreeReaderFactory {
+
+ private final static Log LOG = LogFactory.getLog(TreeReaderFactory.class);
+
+ public static class TreeReaderSchema {
+
+ /**
+ * The types in the ORC file.
+ */
+ List fileTypes;
+
+ /**
+ * The treeReaderSchema that the reader should read as.
+ */
+ List schemaTypes;
+
+ /**
+ * The subtype of the row STRUCT. Different than 0 for ACID.
+ */
+ int innerStructSubtype;
+
+ public TreeReaderSchema() {
+ fileTypes = null;
+ schemaTypes = null;
+ innerStructSubtype = -1;
+ }
+
+ public TreeReaderSchema fileTypes(List fileTypes) {
+ this.fileTypes = fileTypes;
+ return this;
+ }
+
+ public TreeReaderSchema schemaTypes(List schemaTypes) {
+ this.schemaTypes = schemaTypes;
+ return this;
+ }
+
+ public TreeReaderSchema innerStructSubtype(int innerStructSubtype) {
+ this.innerStructSubtype = innerStructSubtype;
+ return this;
+ }
+
+ public List getFileTypes() {
+ return fileTypes;
+ }
+
+ public List getSchemaTypes() {
+ return schemaTypes;
+ }
+
+ public int getInnerStructSubtype() {
+ return innerStructSubtype;
+ }
+ }
+
+ public abstract static class TreeReader {
+ protected final int columnId;
+ protected BitFieldReader present = null;
+ protected boolean valuePresent = false;
+
+ TreeReader(int columnId) throws IOException {
+ this(columnId, null);
+ }
+
+ protected TreeReader(int columnId, InStream in) throws IOException {
+ this.columnId = columnId;
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
+ InStream in,
+ boolean signed, boolean skipCorrupt) throws IOException {
+ switch (kind) {
+ case DIRECT_V2:
+ case DICTIONARY_V2:
+ return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
+ case DIRECT:
+ case DICTIONARY:
+ return new RunLengthIntegerReader(in, signed);
+ default:
+ throw new IllegalArgumentException("Unknown encoding " + kind);
+ }
+ }
+
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ checkEncoding(stripeFooter.getColumnsList().get(columnId));
+ InStream in = streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.PRESENT));
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ /**
+ * Seek to the given position.
+ *
+ * @param index the indexes loaded from the file
+ * @throws IOException
+ */
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ if (present != null) {
+ present.seek(index);
+ }
+ }
+
+ protected long countNonNulls(long rows) throws IOException {
+ if (present != null) {
+ long result = 0;
+ for (long c = 0; c < rows; ++c) {
+ if (present.next() == 1) {
+ result += 1;
+ }
+ }
+ return result;
+ } else {
+ return rows;
+ }
+ }
+
+ abstract void skipRows(long rows) throws IOException;
+
+ public BitFieldReader getPresent() {
+ return present;
+ }
+ }
+
+ public abstract static class DatumTreeReader extends TreeReader {
+
+ DatumTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected DatumTreeReader(int columnId, InStream in) throws IOException {
+ super(columnId, in);
+ }
+
+ Datum next() throws IOException {
+ if (present != null) {
+ valuePresent = present.next() == 1;
+ }
+ return NullDatum.get();
+ }
+ }
+
+ public abstract static class RawStringTreeReader extends TreeReader {
+ RawStringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected RawStringTreeReader(int columnId, InStream in) throws IOException {
+ super(columnId, in);
+ }
+
+ byte[] next() throws IOException {
+ if (present != null) {
+ valuePresent = present.next() == 1;
+ }
+ return null;
+ }
+ }
+
+ public static class BooleanTreeReader extends DatumTreeReader {
+ protected BitFieldReader reader = null;
+
+ BooleanTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ if (data != null) {
+ reader = new BitFieldReader(data, 1);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new BitFieldReader(streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), 1);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createBool(reader.next() == 1) : NullDatum.get();
+ }
+ }
+
+ public static class ByteTreeReader extends DatumTreeReader {
+ protected RunLengthByteReader reader = null;
+
+ ByteTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.reader = new RunLengthByteReader(data);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new RunLengthByteReader(streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createBit(reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class ShortTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ ShortTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected ShortTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInt2((short) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class InetTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ InetTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected InetTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInet4((int) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ /**
+ * Reads an ORC INT (32-bit) column and produces Int4 datums, or NullDatum
+ * for rows marked absent in the PRESENT stream.
+ */
+ public static class IntTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ IntTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data optional DATA stream; reader is created only when both
+ * data and encoding are non-null
+ * @param encoding column encoding, must be DIRECT or DIRECT_V2
+ */
+ protected IntTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ // Encoding may differ per stripe, so rebuild the DATA reader here.
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ // Reposition both the PRESENT stream (super) and the DATA reader.
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ // super.next() advances the PRESENT bitmap and sets valuePresent.
+ super.next();
+ return valuePresent ? DatumFactory.createInt4((int) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ // DATA entries exist only for non-null rows.
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ /**
+ * Reads an ORC BIGINT (64-bit) column and produces Int8 datums, or
+ * NullDatum for rows marked absent in the PRESENT stream.
+ */
+ public static class LongTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+ // Bug fix: retain the caller's skip-corrupt setting. The original dropped
+ // the flag after the constructor and hard-coded "false" in startStripe(),
+ // so per-stripe readers ignored the configured behavior.
+ private final boolean skipCorrupt;
+
+ LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, skipCorrupt);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data optional DATA stream; reader is created only when both
+ * data and encoding are non-null
+ * @param encoding column encoding, must be DIRECT or DIRECT_V2
+ * @param skipCorrupt whether the integer reader should skip corrupt data
+ */
+ protected LongTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding,
+ boolean skipCorrupt)
+ throws IOException {
+ super(columnId, present);
+ this.skipCorrupt = skipCorrupt;
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ // Pass skipCorrupt (was hard-coded false) so the per-stripe reader
+ // behaves like the one built in the constructor.
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, skipCorrupt);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ // Advances the PRESENT bitmap; reader.next() only runs for non-null rows.
+ super.next();
+ return valuePresent ? DatumFactory.createInt8(reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ /**
+ * Reads an ORC FLOAT column (4-byte IEEE values read straight off the DATA
+ * stream via SerializationUtils) and produces Float4 datums.
+ */
+ public static class FloatTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ private final org.apache.orc.impl.SerializationUtils utils;
+
+ FloatTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream holding raw 4-byte floats (may be null until
+ * startStripe supplies one)
+ */
+ protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new org.apache.orc.impl.SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ // utils.readFloat is only invoked for non-null rows.
+ super.next();
+ return valuePresent ? DatumFactory.createFloat4(utils.readFloat(stream)) : NullDatum.get();
+ }
+
+ @Override
+ protected void skipRows(long items) throws IOException {
+ // Floats are fixed width but read through utils, so skip by reading
+ // and discarding one value per non-null row.
+ items = countNonNulls(items);
+ for (int i = 0; i < items; ++i) {
+ utils.readFloat(stream);
+ }
+ }
+ }
+
+ /**
+ * Reads an ORC DOUBLE column (8-byte IEEE values read straight off the DATA
+ * stream via SerializationUtils) and produces Float8 datums.
+ */
+ public static class DoubleTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ private final org.apache.orc.impl.SerializationUtils utils;
+
+ DoubleTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream holding raw 8-byte doubles (may be null until
+ * startStripe supplies one)
+ */
+ protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name =
+ new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createFloat8(utils.readDouble(stream)) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ // Doubles are fixed 8 bytes, so skipping is a pure byte skip; loop
+ // because InStream.skip may skip fewer bytes than requested.
+ items = countNonNulls(items);
+ long len = items * 8;
+ while (len > 0) {
+ len -= stream.skip(len);
+ }
+ }
+ }
+
+ /**
+ * Reads an ORC BINARY column: per-row byte lengths come from the LENGTH
+ * stream and the bytes themselves from the DATA stream. Produces Blob
+ * datums, or NullDatum for absent rows.
+ */
+ public static class BinaryTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ protected IntegerReader lengths = null;
+ // NOTE(review): scratchlcv appears unused in this class — possibly a
+ // leftover from the vectorized Hive reader this was ported from.
+ protected final LongColumnVector scratchlcv;
+
+ BinaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream with the raw bytes
+ * @param length LENGTH stream; the lengths reader is created only when
+ * both length and encoding are non-null
+ * @param encoding column encoding, must be DIRECT or DIRECT_V2
+ */
+ protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ lengths.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+
+ if (valuePresent) {
+ // LENGTH entries exist only for non-null rows, so consume one only
+ // when the row is present; then read exactly that many bytes,
+ // looping because InStream.read may return a short count.
+ int len = (int) lengths.next();
+ byte[] buf = new byte[len];
+ int offset = 0;
+ while (len > 0) {
+ int written = stream.read(buf, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= written;
+ offset += written;
+ }
+ return DatumFactory.createBlob(buf);
+ } else {
+ return NullDatum.get();
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ // Sum the byte lengths of the non-null rows, then skip that many
+ // bytes of DATA.
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+ }
+
+ /**
+ * Reads an ORC TIMESTAMP column. Seconds (relative to a per-writer base
+ * epoch) come from the DATA stream and encoded nanoseconds from the
+ * SECONDARY stream; the two are combined into a Tajo Timestamp datum.
+ * Timezone handling follows the ORC convention of resolving the writer's
+ * timezone recorded in the stripe footer against the reader's local zone.
+ */
+ public static class TimestampTreeReader extends DatumTreeReader {
+ protected IntegerReader data = null;
+ protected IntegerReader nanos = null;
+ private final boolean skipCorrupt;
+ // Caches the base epoch per writer-timezone id so repeated stripes from
+ // the same zone do not re-parse BASE_TIMESTAMP_STRING.
+ private Map baseTimestampMap;
+ private long base_timestamp;
+ private final TimeZone readerTimeZone;
+ private TimeZone writerTimeZone;
+ private boolean hasSameTZRules;
+ // NOTE(review): this injected timeZone is stored but not read anywhere in
+ // this class as visible here — confirm whether it is still needed.
+ private final TimeZone timeZone;
+
+ TimestampTreeReader(TimeZone timeZone, int columnId, boolean skipCorrupt) throws IOException {
+ this(timeZone, columnId, null, null, null, null, skipCorrupt);
+ }
+
+ /**
+ * @param timeZone target timezone supplied by the caller
+ * @param columnId ORC column id of this reader
+ * @param presentStream optional PRESENT (null-bitmap) stream
+ * @param dataStream optional DATA stream (seconds)
+ * @param nanosStream optional SECONDARY stream (encoded nanoseconds)
+ * @param encoding column encoding, must be DIRECT or DIRECT_V2
+ * @param skipCorrupt whether integer readers should skip corrupt data
+ */
+ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentStream, InStream dataStream,
+ InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt)
+ throws IOException {
+ super(columnId, presentStream);
+ this.skipCorrupt = skipCorrupt;
+ this.baseTimestampMap = new HashMap<>();
+ this.readerTimeZone = TimeZone.getDefault();
+ // Until a stripe footer names the writer's zone, assume it matches the
+ // reader's.
+ this.writerTimeZone = readerTimeZone;
+ this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
+ if (encoding != null) {
+ checkEncoding(encoding);
+
+ if (dataStream != null) {
+ this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
+ }
+
+ if (nanosStream != null) {
+ this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
+ }
+ }
+ this.timeZone = timeZone;
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
+ nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
+ // Refresh writerTimeZone/base_timestamp from the stripe's recorded zone.
+ // NOTE(review): the return value is discarded here while base_timestamp
+ // keeps the value computed in the constructor — verify this is intended.
+ getBaseTimestamp(stripeFooter.getWriterTimezone());
+ }
+
+ /**
+ * Returns (and caches) the epoch-seconds of BASE_TIMESTAMP_STRING as
+ * interpreted in the given timezone, also updating writerTimeZone and
+ * hasSameTZRules as a side effect.
+ */
+ private long getBaseTimestamp(String timeZoneId) throws IOException {
+ // to make sure new readers read old files in the same way
+ if (timeZoneId == null || timeZoneId.isEmpty()) {
+ timeZoneId = readerTimeZone.getID();
+ }
+
+ if (!baseTimestampMap.containsKey(timeZoneId)) {
+ writerTimeZone = TimeZone.getTimeZone(timeZoneId);
+ hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ sdf.setTimeZone(writerTimeZone);
+ try {
+ long epoch = sdf.parse(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC;
+ baseTimestampMap.put(timeZoneId, epoch);
+ return epoch;
+ } catch (ParseException e) {
+ throw new IOException("Unable to create base timestamp", e);
+ } finally {
+ sdf.setTimeZone(readerTimeZone);
+ }
+ }
+
+ return baseTimestampMap.get(timeZoneId);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ data.seek(index);
+ nanos.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+
+ if (valuePresent) {
+ long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp);
+ // NOTE(review): subtracting the raw (non-DST) offset of the writer's
+ // zone normalizes to UTC-like millis — confirm DST-sensitive
+ // timestamps round-trip correctly.
+ long adjustedMillis = millis - writerTimeZone.getRawOffset();
+ return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis));
+ } else {
+ return NullDatum.get();
+ }
+ }
+
+ /**
+ * Decodes the ORC nanosecond encoding: the low 3 bits store a count used
+ * to restore trailing decimal zeros, the rest is the significant digits.
+ */
+ private static int parseNanos(long serialized) {
+ int zeros = 7 & (int) serialized;
+ int result = (int) (serialized >>> 3);
+ if (zeros != 0) {
+ for (int i = 0; i <= zeros; ++i) {
+ result *= 10;
+ }
+ }
+ return result;
+ }
+
+ // borrowed from Facebook's TimestampStreamReader
+ private static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) {
+ long millis = (seconds + baseTimestampInSeconds) * DateTimeConstants.MSECS_PER_SEC;
+ long nanos = parseNanos(serializedNanos);
+
+ // the rounding error exists because java always rounds up when dividing integers
+ // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000)
+ // to get the correct value we need
+ // (-42 - 1)*1000 + 999 = -42001
+ // (42)*1000 + 1 = 42001
+ if (millis < 0 && nanos != 0) {
+ millis -= 1000;
+ }
+ // Truncate nanos to millis and add to mills
+ return millis + (nanos / 1_000_000);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ data.skip(items);
+ nanos.skip(items);
+ }
+ }
+
+ /**
+ * Reads an ORC DATE column (days since the Unix epoch in the DATA stream)
+ * and produces Tajo Date datums in Julian-day form.
+ */
+ public static class DateTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ DateTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data optional DATA stream; reader is created only when both
+ * data and encoding are non-null
+ * @param encoding column encoding, must be DIRECT or DIRECT_V2
+ */
+ protected DateTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ // ORC stores epoch days; Tajo dates are Julian days, hence the shift.
+ return valuePresent ?
+ DatumFactory.createDate((int) reader.next() + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ public static class StringTreeReader extends DatumTreeReader {
+ // Delegate that does the actual byte-level reading; chosen per stripe
+ // (or per constructor encoding) as direct vs dictionary.
+ protected RawStringTreeReader reader;
+
+ StringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream
+ * @param length LENGTH stream
+ * @param dictionary DICTIONARY_DATA stream (dictionary encodings only)
+ * @param encoding selects the direct or dictionary delegate
+ */
+ protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ // The delegate returns null (not an empty array) for absent rows.
+ byte[] bytes = reader.next();
+ return bytes == null ? NullDatum.get() : DatumFactory.createText(bytes);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ /**
+ * Minimal replacement for the Hive Text-reading shim: reads exactly
+ * {@code len} bytes from the wrapped stream, looping over short reads.
+ */
+ private final static class BasicTextReaderShim {
+ private final InputStream in;
+
+ public BasicTextReaderShim(InputStream in) {
+ this.in = in;
+ }
+
+ /**
+ * @param len exact number of bytes to read
+ * @return a freshly allocated array of exactly len bytes
+ * @throws EOFException if the stream ends before len bytes are read
+ */
+ public byte[] read(int len) throws IOException {
+ int offset = 0;
+ byte[] bytes = new byte[len];
+ while (len > 0) {
+ int written = in.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish read from " + in + " read "
+ + (offset) + " bytes out of " + bytes.length);
+ }
+ len -= written;
+ offset += written;
+ }
+ return bytes;
+ }
+ }
+
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe.
+ */
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe: per-row byte lengths from the LENGTH stream, bytes from DATA.
+ */
+ public static class StringDirectTreeReader extends RawStringTreeReader {
+ protected InStream stream;
+ protected BasicTextReaderShim data;
+ protected IntegerReader lengths;
+ // NOTE(review): appears unused here — likely residue from the vectorized
+ // Hive reader this was ported from.
+ private final LongColumnVector scratchlcv;
+
+ StringDirectTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream with the raw string bytes
+ * @param length LENGTH stream; lengths reader is created only when both
+ * length and encoding are non-null
+ * @param encoding DIRECT or DIRECT_V2
+ */
+ protected StringDirectTreeReader(int columnId, InStream present, InStream data,
+ InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
+ super(columnId, present);
+ this.scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ this.lengths = createIntegerReader(encoding, length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ data = new BasicTextReaderShim(stream);
+
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ // don't seek data stream
+ lengths.seek(index);
+ }
+
+ @Override
+ byte[] next() throws IOException {
+ super.next();
+ // Bug fix: the original called lengths.next() unconditionally, consuming
+ // a LENGTH entry even for null rows. LENGTH entries exist only for
+ // non-null rows (see skipRows), so that desynchronized the length stream
+ // and corrupted every subsequent value after a null.
+ if (!valuePresent) {
+ return null;
+ }
+ int len = (int) lengths.next();
+ return data.read(len);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+
+ public IntegerReader getLengths() {
+ return lengths;
+ }
+
+ public InStream getStream() {
+ return stream;
+ }
+ }
+
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe.
+ */
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe: DATA holds dictionary indexes, DICTIONARY_DATA the blob of all
+ * entries, and LENGTH the per-entry lengths used to build offsets.
+ */
+ public static class StringDictionaryTreeReader extends RawStringTreeReader {
+ private org.apache.orc.impl.DynamicByteArray dictionaryBuffer;
+ // dictionaryOffsets[i] is the start of entry i; the extra trailing slot
+ // holds the total size so entry lengths are offset differences.
+ private int[] dictionaryOffsets;
+ protected IntegerReader reader;
+
+ private byte[] dictionaryBufferInBytesCache = null;
+ // NOTE(review): scratchlcv appears unused — residue of the vectorized
+ // Hive reader this was ported from.
+ private final LongColumnVector scratchlcv;
+ private final Text result = new Text();
+
+ StringDictionaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null, null);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream of dictionary indexes
+ * @param length LENGTH stream of dictionary-entry lengths
+ * @param dictionary DICTIONARY_DATA stream of entry bytes
+ * @param encoding DICTIONARY or DICTIONARY_V2
+ */
+ protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ if (data != null && encoding != null) {
+ this.reader = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+
+ if (dictionary != null && encoding != null) {
+ readDictionaryStream(dictionary);
+ }
+
+ if (length != null && encoding != null) {
+ readDictionaryLengthStream(length, encoding);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+
+ // read the dictionary blob
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ InStream in = streams.get(name);
+ readDictionaryStream(in);
+
+ // read the lengths
+ name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
+ in = streams.get(name);
+ readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
+
+ // set up the row reader
+ name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), false, false);
+ }
+
+ // Builds dictionaryOffsets (prefix sums of entry lengths) for the stripe.
+ private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ int dictionarySize = encoding.getDictionarySize();
+ if (in != null) { // Guard against empty LENGTH stream.
+ IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
+ for (int i = 0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
+ }
+
+ }
+
+ // Loads the dictionary blob for the stripe; null stream clears it.
+ private void readDictionaryStream(InStream in) throws IOException {
+ if (in != null) { // Guard against empty dictionary stream.
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
+ // Since its start of strip invalidate the cache.
+ dictionaryBufferInBytesCache = null;
+ }
+ in.close();
+ } else {
+ dictionaryBuffer = null;
+ }
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ byte[] next() throws IOException {
+ super.next();
+ if (valuePresent) {
+ int entry = (int) reader.next();
+ int offset = dictionaryOffsets[entry];
+ int length = getDictionaryEntryLength(entry, offset);
+ // If the column is just empty strings, the size will be zero,
+ // so the buffer will be null, in that case just return result
+ // as it will default to empty
+ if (dictionaryBuffer != null) {
+ dictionaryBuffer.setText(result, offset, length);
+ } else {
+ result.clear();
+ }
+ // Bug fix: Text.getBytes() returns the reused backing buffer, whose
+ // capacity can exceed getLength() after a longer previous entry, so
+ // the original could hand callers stale trailing bytes. Copy exactly
+ // the valid region instead.
+ byte[] bytes = new byte[result.getLength()];
+ System.arraycopy(result.getBytes(), 0, bytes, 0, bytes.length);
+ return bytes;
+ } else {
+ return null;
+ }
+ }
+
+ // Length of a dictionary entry derived from the offsets table.
+ int getDictionaryEntryLength(int entry, int offset) {
+ final int length;
+ // if it isn't the last entry, subtract the offsets otherwise use
+ // the buffer length.
+ if (entry < dictionaryOffsets.length - 1) {
+ length = dictionaryOffsets[entry + 1] - offset;
+ } else {
+ length = dictionaryBuffer.size() - offset;
+ }
+ return length;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ public IntegerReader getReader() {
+ return reader;
+ }
+ }
+
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ /**
+ * Reads an ORC CHAR column. Structurally identical to StringTreeReader
+ * (direct vs dictionary delegate chosen per stripe) but produces Char
+ * datums. maxLength is carried but not yet enforced (see TODO in next()).
+ */
+ public static class CharTreeReader extends DatumTreeReader {
+ protected RawStringTreeReader reader;
+ private final int maxLength;
+
+ CharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, null, null, null, null, null, maxLength);
+ }
+
+ /**
+ * @param columnId ORC column id of this reader
+ * @param present optional PRESENT (null-bitmap) stream
+ * @param data DATA stream
+ * @param length LENGTH stream
+ * @param dictionary DICTIONARY_DATA stream (dictionary encodings only)
+ * @param encoding selects the direct or dictionary delegate
+ * @param maxLength declared CHAR(n) length from the schema
+ */
+ protected CharTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding, int maxLength) throws IOException {
+ super(columnId, present);
+ this.maxLength = maxLength;
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ // Delegate returns null for absent rows.
+ byte[] bytes = reader.next();
+
+ if (bytes == null) {
+ return NullDatum.get();
+ }
+ // TODO: enforce char length
+ return DatumFactory.createChar(bytes);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ // TODO: enable this to support record type
+// protected static class StructTreeReader extends TreeReader {
+// private final int fileColumnCount;
+// private final int resultColumnCount;
+// protected final TreeReader[] fields;
+// private final String[] fieldNames;
+//
+// protected StructTreeReader(
+// int columnId,
+// TreeReaderSchema treeReaderSchema,
+// boolean[] included,
+// boolean skipCorrupt) throws IOException {
+// super(columnId);
+//
+// OrcProto.Type fileStructType = treeReaderSchema.getFileTypes().get(columnId);
+// fileColumnCount = fileStructType.getFieldNamesCount();
+//
+// OrcProto.Type schemaStructType = treeReaderSchema.getSchemaTypes().get(columnId);
+//
+// if (columnId == treeReaderSchema.getInnerStructSubtype()) {
+// // If there are more result columns than reader columns, we will default those additional
+// // columns to NULL.
+// resultColumnCount = schemaStructType.getFieldNamesCount();
+// } else {
+// resultColumnCount = fileColumnCount;
+// }
+//
+// this.fields = new TreeReader[fileColumnCount];
+// this.fieldNames = new String[fileColumnCount];
+//
+// if (included == null) {
+// for (int i = 0; i < fileColumnCount; ++i) {
+// int subtype = schemaStructType.getSubtypes(i);
+// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt);
+// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name.
+// this.fieldNames[i] = schemaStructType.getFieldNames(i);
+// }
+// } else {
+// for (int i = 0; i < fileColumnCount; ++i) {
+// int subtype = schemaStructType.getSubtypes(i);
+// if (subtype >= included.length) {
+// throw new IOException("subtype " + subtype + " exceeds the included array size " +
+// included.length + " fileTypes " + treeReaderSchema.getFileTypes().toString() +
+// " schemaTypes " + treeReaderSchema.getSchemaTypes().toString() +
+// " innerStructSubtype " + treeReaderSchema.getInnerStructSubtype());
+// }
+// if (included[subtype]) {
+// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt);
+// }
+// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name.
+// this.fieldNames[i] = schemaStructType.getFieldNames(i);
+// }
+// }
+// }
+//
+// @Override
+// void seek(PositionProvider[] index) throws IOException {
+// super.seek(index);
+// for (TreeReader kid : fields) {
+// if (kid != null) {
+// kid.seek(index);
+// }
+// }
+// }
+//
+// @Override
+// Object next(Object previous) throws IOException {
+// super.next(previous);
+// OrcStruct result = null;
+// if (valuePresent) {
+// if (previous == null) {
+// result = new OrcStruct(resultColumnCount);
+// } else {
+// result = (OrcStruct) previous;
+//
+// // If the input format was initialized with a file with a
+// // different number of fields, the number of fields needs to
+// // be updated to the correct number
+// if (result.getNumFields() != resultColumnCount) {
+// result.setNumFields(resultColumnCount);
+// }
+// }
+// for (int i = 0; i < fileColumnCount; ++i) {
+// if (fields[i] != null) {
+// result.setFieldValue(i, fields[i].next(result.getFieldValue(i)));
+// }
+// }
+// if (resultColumnCount > fileColumnCount) {
+// for (int i = fileColumnCount; i < resultColumnCount; ++i) {
+// // Default new treeReaderSchema evolution fields to NULL.
+// result.setFieldValue(i, null);
+// }
+// }
+// }
+// return result;
+// }
+//
+// @Override
+// void startStripe(Map streams,
+// OrcProto.StripeFooter stripeFooter
+// ) throws IOException {
+// super.startStripe(streams, stripeFooter);
+// for (TreeReader field : fields) {
+// if (field != null) {
+// field.startStripe(streams, stripeFooter);
+// }
+// }
+// }
+//
+// @Override
+// void skipRows(long items) throws IOException {
+// items = countNonNulls(items);
+// for (TreeReader field : fields) {
+// if (field != null) {
+// field.skipRows(items);
+// }
+// }
+// }
+// }
+
+ /**
+ * Factory that maps a Tajo column type to the matching ORC tree reader.
+ *
+ * @param timeZone timezone passed through to timestamp readers
+ * @param columnId zero-based Tajo column index
+ * @param column Tajo column whose type selects the reader
+ * @param skipCorrupt whether integer-backed readers skip corrupt data
+ * @throws TajoRuntimeException (unchecked) for types with no ORC reader
+ */
+ public static DatumTreeReader createTreeReader(TimeZone timeZone,
+ int columnId,
+ Column column,
+ boolean skipCorrupt
+ ) throws IOException {
+ TypeDesc typeDesc = column.getTypeDesc();
+ // ORC column 0 is the implicit root struct, so field ids start at 1.
+ int orcColumnId = columnId + 1; // root record column is considered
+ switch (typeDesc.getDataType().getType()) {
+ case BOOLEAN:
+ return new BooleanTreeReader(orcColumnId);
+ case BIT:
+ return new ByteTreeReader(orcColumnId);
+ case FLOAT8:
+ return new DoubleTreeReader(orcColumnId);
+ case FLOAT4:
+ return new FloatTreeReader(orcColumnId);
+ case INT2:
+ return new ShortTreeReader(orcColumnId);
+ case INT4:
+ return new IntTreeReader(orcColumnId);
+ case INT8:
+ return new LongTreeReader(orcColumnId, skipCorrupt);
+ case TEXT:
+ return new StringTreeReader(orcColumnId);
+ case CHAR:
+ return new CharTreeReader(orcColumnId, typeDesc.getDataType().getLength());
+ case BLOB:
+ return new BinaryTreeReader(orcColumnId);
+ case TIMESTAMP:
+ return new TimestampTreeReader(timeZone, orcColumnId, skipCorrupt);
+ case DATE:
+ return new DateTreeReader(orcColumnId);
+ case INET4:
+ return new InetTreeReader(orcColumnId);
+// case STRUCT:
+// return new StructTreeReader(columnId, treeReaderSchema, included, skipCorrupt);
+ default:
+ throw new TajoRuntimeException(new UnsupportedException("Unsupported type " +
+ typeDesc.getDataType().getType().name()));
+ }
+ }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
index 669b44fbd3..2c85aa6653 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
@@ -18,6 +18,8 @@
package org.apache.tajo.storage.thirdparty.orc;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
import org.apache.tajo.storage.Tuple;
import java.io.IOException;
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
index 833d102744..e0ad3d7bed 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
@@ -19,7 +19,6 @@
package org.apache.tajo.storage.thirdparty.orc;
import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;
import com.google.protobuf.ByteString;
@@ -30,21 +29,20 @@
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.io.IOConstants;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.apache.tajo.datum.*;
-import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.storage.thirdparty.orc.CompressionCodec.Modifier;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.RowIndexEntry;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.StripeStatistics;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.Type;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.UserMetadataItem;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.hadoop.hive.serde2.objectinspector.*;
-import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.Text;
+import org.apache.orc.*;
+import org.apache.orc.CompressionCodec.Modifier;
+import org.apache.orc.OrcProto.RowIndexEntry;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.impl.*;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.datum.Inet4Datum;
+import org.apache.tajo.datum.Int4Datum;
+import org.apache.tajo.datum.Int8Datum;
+import org.apache.tajo.storage.Tuple;
+import org.apache.tajo.storage.thirdparty.orc.OrcFile.*;
+import org.apache.tajo.util.datetime.DateTimeConstants;
import org.apache.tajo.util.datetime.DateTimeUtil;
import java.io.IOException;
@@ -94,10 +92,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final boolean addBlockPadding;
private final int bufferSize;
private final long blockSize;
- private final float paddingTolerance;
+ private final double paddingTolerance;
+ private final TypeDescription schema;
+
// the streams that make up the current stripe
- private final Map streams =
- new TreeMap<>();
+ private final Map streams = new TreeMap<>();
private FSDataOutputStream rawWriter = null;
// the compressed metadata information outStream
@@ -111,47 +110,32 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private long rawDataSize = 0;
private int rowsInIndex = 0;
private int stripesAtLastFlush = -1;
- private final List stripes =
- new ArrayList<>();
- private final Map userMetadata =
- new TreeMap<>();
+ private final List stripes = new ArrayList<>();
+ private final Map userMetadata = new TreeMap<>();
+ private final StreamFactory streamFactory = new StreamFactory();
private final TreeWriter treeWriter;
private final boolean buildIndex;
private final MemoryManager memoryManager;
- private final OrcFile.Version version;
+ private final Version version;
private final Configuration conf;
- private final OrcFile.WriterCallback callback;
- private final OrcFile.WriterContext callbackContext;
- private final OrcFile.EncodingStrategy encodingStrategy;
- private final OrcFile.CompressionStrategy compressionStrategy;
+ private final WriterCallback callback;
+ private final WriterContext callbackContext;
+ private final EncodingStrategy encodingStrategy;
+ private final CompressionStrategy compressionStrategy;
private final boolean[] bloomFilterColumns;
private final double bloomFilterFpp;
private boolean writeTimeZone;
private TimeZone timeZone;
- WriterImpl(FileSystem fs,
- Path path,
- Configuration conf,
- ObjectInspector inspector,
- long stripeSize,
- CompressionKind compress,
- int bufferSize,
- int rowIndexStride,
- MemoryManager memoryManager,
- boolean addBlockPadding,
- OrcFile.Version version,
- OrcFile.WriterCallback callback,
- OrcFile.EncodingStrategy encodingStrategy,
- OrcFile.CompressionStrategy compressionStrategy,
- float paddingTolerance,
- long blockSizeValue,
- String bloomFilterColumnNames,
- double bloomFilterFpp,
- TimeZone timeZone) throws IOException {
+ public WriterImpl(FileSystem fs,
+ Path path,
+ OrcFile.WriterOptions opts,
+ TimeZone timeZone) throws IOException {
this.fs = fs;
this.path = path;
- this.conf = conf;
- this.callback = callback;
+ this.conf = opts.getConfiguration();
+ this.callback = opts.getCallback();
+ this.schema = opts.getSchema();
if (callback != null) {
callbackContext = new OrcFile.WriterContext(){
@@ -163,100 +147,60 @@ public Writer getWriter() {
} else {
callbackContext = null;
}
- this.adjustedStripeSize = stripeSize;
- this.defaultStripeSize = stripeSize;
- this.version = version;
- this.encodingStrategy = encodingStrategy;
- this.compressionStrategy = compressionStrategy;
- this.addBlockPadding = addBlockPadding;
- this.blockSize = blockSizeValue;
- this.paddingTolerance = paddingTolerance;
- this.compress = compress;
- this.rowIndexStride = rowIndexStride;
- this.memoryManager = memoryManager;
- this.timeZone = timeZone;
+ this.adjustedStripeSize = opts.getStripeSize();
+ this.defaultStripeSize = opts.getStripeSize();
+ this.version = opts.getVersion();
+ this.encodingStrategy = opts.getEncodingStrategy();
+ this.compressionStrategy = opts.getCompressionStrategy();
+ this.addBlockPadding = opts.getBlockPadding();
+ this.blockSize = opts.getBlockSize();
+ this.paddingTolerance = opts.getPaddingTolerance();
+ this.compress = opts.getCompress();
+ this.rowIndexStride = opts.getRowIndexStride();
+ this.memoryManager = opts.getMemoryManager();
buildIndex = rowIndexStride > 0;
codec = createCodec(compress);
- String allColumns = conf.get(IOConstants.COLUMNS);
- if (allColumns == null) {
- allColumns = getColumnNamesFromInspector(inspector);
- }
- this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
+ int numColumns = schema.getMaximumId() + 1;
+ this.bufferSize = getEstimatedBufferSize(defaultStripeSize,
+ numColumns, opts.getBufferSize());
if (version == OrcFile.Version.V_0_11) {
/* do not write bloom filters for ORC v11 */
- this.bloomFilterColumns =
- OrcUtils.includeColumns(null, allColumns, inspector);
+ this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1];
} else {
this.bloomFilterColumns =
- OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
+ OrcUtils.includeColumns(opts.getBloomFilterColumns(), schema);
}
- this.bloomFilterFpp = bloomFilterFpp;
- treeWriter = createTreeWriter(inspector, new StreamFactory(), false);
+ this.bloomFilterFpp = opts.getBloomFilterFpp();
+ this.timeZone = timeZone;
+ treeWriter = createTreeWriter(schema, streamFactory, false);
if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
}
// ensure that we are able to handle callbacks before we register ourselves
- memoryManager.addWriter(path, stripeSize, this);
- }
-
- private String getColumnNamesFromInspector(ObjectInspector inspector) {
- List fieldNames = Lists.newArrayList();
- Joiner joiner = Joiner.on(",");
- if (inspector instanceof StructObjectInspector) {
- StructObjectInspector soi = (StructObjectInspector) inspector;
- List extends StructField> fields = soi.getAllStructFieldRefs();
- for(StructField sf : fields) {
- fieldNames.add(sf.getFieldName());
- }
- }
- return joiner.join(fieldNames);
+ memoryManager.addWriter(path, opts.getStripeSize(), this);
}
@VisibleForTesting
- int getEstimatedBufferSize(int bs) {
- return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
- }
-
- int getEstimatedBufferSize(String colNames, int bs) {
- long availableMem = getMemoryAvailableForORC();
- if (colNames != null) {
- final int numCols = colNames.split(",").length;
- if (numCols > COLUMN_COUNT_THRESHOLD) {
- // In BufferedStream, there are 3 outstream buffers (compressed,
- // uncompressed and overflow) and list of previously compressed buffers.
- // Since overflow buffer is rarely used, lets consider only 2 allocation.
- // Also, initially, the list of compression buffers will be empty.
- final int outStreamBuffers = codec == null ? 1 : 2;
-
- // max possible streams per column is 5. For string columns, there is
- // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams.
- final int maxStreams = 5;
-
- // Lets assume 10% memory for holding dictionary in memory and other
- // object allocations
- final long miscAllocation = (long) (0.1f * availableMem);
-
- // compute the available memory
- final long remainingMem = availableMem - miscAllocation;
-
- int estBufferSize = (int) (remainingMem /
- (maxStreams * outStreamBuffers * numCols));
- estBufferSize = getClosestBufferSize(estBufferSize, bs);
- if (estBufferSize > bs) {
- estBufferSize = bs;
- }
-
- LOG.info("WIDE TABLE - Number of columns: " + numCols +
- " Chosen compression buffer size: " + estBufferSize);
- return estBufferSize;
- }
+ public static int getEstimatedBufferSize(long stripeSize, int numColumns,
+ int bs) {
+    // The worst case is that there are 2 big streams per column and
+ // we want to guarantee that each stream gets ~10 buffers.
+ // This keeps buffers small enough that we don't get really small stripe
+ // sizes.
+ int estBufferSize = (int) (stripeSize / (20 * numColumns));
+ estBufferSize = getClosestBufferSize(estBufferSize);
+ if (estBufferSize > bs) {
+ estBufferSize = bs;
+ } else {
+ LOG.info("WIDE TABLE - Number of columns: " + numColumns +
+ " Chosen compression buffer size: " + estBufferSize);
}
- return bs;
+ return estBufferSize;
}
- private int getClosestBufferSize(int estBufferSize, int bs) {
+ private static int getClosestBufferSize(int estBufferSize) {
final int kb4 = 4 * 1024;
final int kb8 = 8 * 1024;
final int kb16 = 16 * 1024;
@@ -616,8 +560,7 @@ public TimeZone getTimeZone() {
*/
private abstract static class TreeWriter {
protected final int id;
- protected final ObjectInspector inspector;
- private final BitFieldWriter isPresent;
+ protected final BitFieldWriter isPresent;
private final boolean isCompressed;
protected final ColumnStatisticsImpl indexStatistics;
protected final ColumnStatisticsImpl stripeColStatistics;
@@ -634,24 +577,24 @@ private abstract static class TreeWriter {
private final OrcProto.BloomFilter.Builder bloomFilterEntry;
private boolean foundNulls;
private OutStream isPresentOutStream;
- private final List stripeStatsBuilders;
+ private final List stripeStatsBuilders;
private final StreamFactory streamFactory;
/**
* Create a tree writer.
* @param columnId the column id of the column to write
- * @param inspector the object inspector to use
+ * @param schema the row schema
* @param streamFactory limited access to the Writer's data.
* @param nullable can the value be null?
* @throws IOException
*/
- TreeWriter(int columnId, ObjectInspector inspector,
+ TreeWriter(int columnId,
+ TypeDescription schema,
StreamFactory streamFactory,
boolean nullable) throws IOException {
this.streamFactory = streamFactory;
this.isCompressed = streamFactory.isCompressed();
this.id = columnId;
- this.inspector = inspector;
if (nullable) {
isPresentOutStream = streamFactory.createStream(id,
OrcProto.Stream.Kind.PRESENT);
@@ -661,9 +604,9 @@ private abstract static class TreeWriter {
}
this.foundNulls = false;
createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
- indexStatistics = ColumnStatisticsImpl.create(inspector);
- stripeColStatistics = ColumnStatisticsImpl.create(inspector);
- fileStatistics = ColumnStatisticsImpl.create(inspector);
+ indexStatistics = ColumnStatisticsImpl.create(schema);
+ stripeColStatistics = ColumnStatisticsImpl.create(schema);
+ fileStatistics = ColumnStatisticsImpl.create(schema);
childrenWriters = new TreeWriter[0];
rowIndex = OrcProto.RowIndex.newBuilder();
rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
@@ -912,10 +855,10 @@ private static class BooleanTreeWriter extends TreeWriter {
private final BitFieldWriter writer;
BooleanTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
PositionedOutputStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.writer = new BitFieldWriter(out, 1);
@@ -927,7 +870,7 @@ void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
boolean val = datum.asBool();
- indexStatistics.updateBoolean(val);
+ indexStatistics.updateBoolean(val, 1);
writer.write(val ? 1 : 0);
}
}
@@ -951,10 +894,10 @@ private static class ByteTreeWriter extends TreeWriter {
private final RunLengthByteWriter writer;
ByteTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.writer = new RunLengthByteWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA));
recordPosition(rowIndexPosition);
@@ -965,7 +908,7 @@ void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
byte val = datum.asByte();
- indexStatistics.updateInteger(val);
+ indexStatistics.updateInteger(val, 1);
if (createBloomFilter) {
bloomFilter.addLong(val);
}
@@ -993,10 +936,10 @@ private static class IntegerTreeWriter extends TreeWriter {
private boolean isDirectV2 = true;
IntegerTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
OutStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
@@ -1026,7 +969,7 @@ void write(Datum datum) throws IOException {
} else {
val = datum.asInt2();
}
- indexStatistics.updateInteger(val);
+ indexStatistics.updateInteger(val, 1);
if (createBloomFilter) {
// integers are converted to longs in column statistics and during SARG evaluation
bloomFilter.addLong(val);
@@ -1055,10 +998,10 @@ private static class FloatTreeWriter extends TreeWriter {
private final SerializationUtils utils;
FloatTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.utils = new SerializationUtils();
@@ -1099,10 +1042,10 @@ private static class DoubleTreeWriter extends TreeWriter {
private final SerializationUtils utils;
DoubleTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.utils = new SerializationUtils();
@@ -1137,33 +1080,33 @@ void recordPosition(PositionRecorder recorder) throws IOException {
}
}
- private static class StringTreeWriter extends TreeWriter {
+ private static abstract class StringBaseTreeWriter extends TreeWriter {
private static final int INITIAL_DICTIONARY_SIZE = 4096;
private final OutStream stringOutput;
private final IntegerWriter lengthOutput;
private final IntegerWriter rowOutput;
- private final StringRedBlackTree dictionary =
+ protected final StringRedBlackTree dictionary =
new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
- private final DynamicIntArray rows = new DynamicIntArray();
- private final PositionedOutputStream directStreamOutput;
- private final IntegerWriter directLengthOutput;
- private final List savedRowIndex =
- new ArrayList<>();
+ protected final DynamicIntArray rows = new DynamicIntArray();
+ protected final PositionedOutputStream directStreamOutput;
+ protected final IntegerWriter directLengthOutput;
+ private final List savedRowIndex =
+ new ArrayList();
private final boolean buildIndex;
- private final List rowIndexValueCount = new ArrayList<>();
+ private final List rowIndexValueCount = new ArrayList();
// If the number of keys in a dictionary is greater than this fraction of
//the total number of non-null rows, turn off dictionary encoding
- private final float dictionaryKeySizeThreshold;
- private boolean useDictionaryEncoding = true;
+ private final double dictionaryKeySizeThreshold;
+ protected boolean useDictionaryEncoding = true;
private boolean isDirectV2 = true;
private boolean doneDictionaryCheck;
- private final boolean strideDictionaryCheck;
+ protected final boolean strideDictionaryCheck;
- StringTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ StringBaseTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.isDirectV2 = isNewWriteFormat(writer);
stringOutput = writer.createStream(id,
OrcProto.Stream.Kind.DICTIONARY_DATA);
@@ -1177,33 +1120,14 @@ private static class StringTreeWriter extends TreeWriter {
directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
directLengthOutput = createIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- dictionaryKeySizeThreshold = writer.getConfiguration().getFloat(
- OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
- OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal);
- strideDictionaryCheck = writer.getConfiguration().getBoolean(
- OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname,
- OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.defaultBoolVal);
+ Configuration conf = writer.getConfiguration();
+ dictionaryKeySizeThreshold =
+ org.apache.orc.OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
+ strideDictionaryCheck =
+ org.apache.orc.OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
doneDictionaryCheck = false;
}
- @Override
- void write(Datum datum) throws IOException {
- super.write(datum);
- if (datum != null && datum.isNotNull()) {
- if (useDictionaryEncoding || !strideDictionaryCheck) {
- rows.add(dictionary.add(datum.toString()));
- } else {
- // write data and length
- directStreamOutput.write(datum.asByteArray(), 0, datum.size());
- directLengthOutput.write(datum.size());
- }
- indexStatistics.updateString(datum.toString());
- if (createBloomFilter) {
- bloomFilter.addBytes(datum.asByteArray(), datum.size());
- }
- }
- }
-
private boolean checkDictionaryEncoding() {
if (!doneDictionaryCheck) {
// Set the flag indicating whether or not to use dictionary encoding
@@ -1269,7 +1193,7 @@ private void flushDictionary() throws IOException {
private int currentId = 0;
@Override
public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
+ ) throws IOException {
context.writeBytes(stringOutput);
lengthOutput.write(context.getLength());
dumpOrder[context.getOriginalPosition()] = currentId++;
@@ -1383,29 +1307,76 @@ long estimateMemory() {
}
}
+ private static class StringTreeWriter extends StringBaseTreeWriter {
+ StringTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ }
+
+ @Override
+ void write(Datum datum) throws IOException {
+ super.write(datum);
+ if (datum != null && datum.isNotNull()) {
+ if (useDictionaryEncoding || !strideDictionaryCheck) {
+ rows.add(dictionary.add(datum.toString()));
+ } else {
+ // write data and length
+ directStreamOutput.write(datum.asByteArray(), 0, datum.size());
+ directLengthOutput.write(datum.size());
+ }
+ byte[] buf = datum.asByteArray();
+ indexStatistics.updateString(buf, 0, buf.length, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(buf, 0, buf.length);
+ }
+ }
+ }
+ }
+
/**
* Under the covers, char is written to ORC the same way as string.
*/
private static class CharTreeWriter extends StringTreeWriter {
+ private final int itemLength;
+ private final byte[] padding;
CharTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ itemLength = schema.getMaxLength();
+ padding = new byte[itemLength];
}
- }
- /**
- * Under the covers, varchar is written to ORC the same way as string.
- */
- private static class VarcharTreeWriter extends StringTreeWriter {
+ @Override
+ void write(Datum datum) throws IOException {
+ super.write(datum);
+ if (datum != null && datum.isNotNull()) {
+ byte[] ptr;
+ byte[] buf = datum.asByteArray();
+ if (buf.length >= itemLength) {
+ ptr = buf;
+ } else {
+ ptr = padding;
+ System.arraycopy(buf, 0, ptr, 0, buf.length);
+ Arrays.fill(ptr, buf.length, itemLength, (byte) ' ');
+ }
+ if (useDictionaryEncoding || !strideDictionaryCheck) {
+ rows.add(dictionary.add(ptr, 0, itemLength));
+ } else {
+ // write data and length
+ directStreamOutput.write(ptr, 0, itemLength);
+ directLengthOutput.write(itemLength);
+ }
- VarcharTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ indexStatistics.updateString(ptr, 0, ptr.length, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(ptr, 0, ptr.length);
+ }
+ }
}
}
@@ -1415,10 +1386,10 @@ private static class BinaryTreeWriter extends TreeWriter {
private boolean isDirectV2 = true;
BinaryTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
@@ -1441,11 +1412,12 @@ OrcProto.ColumnEncoding getEncoding() {
void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
- stream.write(datum.asByteArray(), 0, datum.size());
+ byte[] buf = datum.asByteArray();
+ stream.write(buf, 0, buf.length);
length.write(datum.size());
- indexStatistics.updateBinary(datum);
+ indexStatistics.updateBinary(buf, 0, buf.length, 1);
if (createBloomFilter) {
- bloomFilter.addBytes(datum.asByteArray(), datum.size());
+ bloomFilter.addBytes(buf, 0, buf.length);
}
}
}
@@ -1467,7 +1439,6 @@ void recordPosition(PositionRecorder recorder) throws IOException {
}
}
- static final int MILLIS_PER_SECOND = 1000;
static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";
private static class TimestampTreeWriter extends TreeWriter {
@@ -1478,10 +1449,10 @@ private static class TimestampTreeWriter extends TreeWriter {
private TimeZone timeZone;
TimestampTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.isDirectV2 = isNewWriteFormat(writer);
this.seconds = createIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
@@ -1489,7 +1460,7 @@ private static class TimestampTreeWriter extends TreeWriter {
OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
recordPosition(rowIndexPosition);
// for unit tests to set different time zones
- this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
+ this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC;
writer.useWriterTimeZone(true);
timeZone = writer.getTimeZone();
}
@@ -1515,7 +1486,7 @@ void write(Datum datum) throws IOException {
Timestamp val = new Timestamp(javaTimestamp);
indexStatistics.updateTimestamp(val);
- seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp);
+ seconds.write((val.getTime() / DateTimeConstants.MSECS_PER_SEC) - base_timestamp);
nanos.write(formatNanos(val.getNanos()));
if (createBloomFilter) {
bloomFilter.addLong(val.getTime());
@@ -1561,12 +1532,12 @@ private static class DateTreeWriter extends TreeWriter {
private final boolean isDirectV2;
DateTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
+ OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
this.writer = createIntegerWriter(out, true, isDirectV2, writer);
recordPosition(rowIndexPosition);
@@ -1612,19 +1583,17 @@ OrcProto.ColumnEncoding getEncoding() {
}
private static class StructTreeWriter extends TreeWriter {
- private final List extends StructField> fields;
StructTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
- StructObjectInspector structObjectInspector =
- (StructObjectInspector) inspector;
- fields = structObjectInspector.getAllStructFieldRefs();
- childrenWriters = new TreeWriter[fields.size()];
+ super(columnId, schema, writer, nullable);
+ List children = schema.getChildren();
+ childrenWriters = new TreeWriter[children.size()];
for(int i=0; i < childrenWriters.length; ++i) {
childrenWriters[i] = createTreeWriter(
- fields.get(i).getFieldObjectInspector(), writer, true);
+ children.get(i), writer,
+ true);
}
recordPosition(rowIndexPosition);
}
@@ -1636,9 +1605,8 @@ void write(Datum datum) throws IOException {
void writeTuple(Tuple tuple) throws IOException {
super.write(tuple);
if (tuple != null) {
- for(int i = 0; i < fields.size(); ++i) {
- TreeWriter writer = childrenWriters[i];
- writer.write(tuple.asDatum(i));
+ for(int i = 0; i < childrenWriters.length; ++i) {
+ childrenWriters[i].write(tuple.asDatum(i));
}
}
}
@@ -1654,159 +1622,136 @@ void writeStripe(OrcProto.StripeFooter.Builder builder,
}
}
- private static TreeWriter createTreeWriter(ObjectInspector inspector,
+ private static TreeWriter createTreeWriter(TypeDescription schema,
StreamFactory streamFactory,
boolean nullable) throws IOException {
- switch (inspector.getCategory()) {
- case PRIMITIVE:
- switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) {
- case BOOLEAN:
- case VOID:
- return new BooleanTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case BYTE:
- return new ByteTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case SHORT:
- case INT:
- case LONG:
- return new IntegerTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case FLOAT:
- return new FloatTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case DOUBLE:
- return new DoubleTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case STRING:
- return new StringTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case CHAR:
- return new CharTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case VARCHAR:
- return new VarcharTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case BINARY:
- return new BinaryTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case TIMESTAMP:
- return new TimestampTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case DATE:
- return new DateTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- default:
- throw new IllegalArgumentException("Bad primitive category " +
- ((PrimitiveObjectInspector) inspector).getPrimitiveCategory());
- }
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return new BooleanTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BYTE:
+ return new ByteTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case SHORT:
+ case INT:
+ case LONG:
+ return new IntegerTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case FLOAT:
+ return new FloatTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DOUBLE:
+ return new DoubleTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case STRING:
+ return new StringTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case CHAR:
+ return new CharTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BINARY:
+ return new BinaryTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case TIMESTAMP:
+ return new TimestampTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DATE:
+ return new DateTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
case STRUCT:
- return new StructTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ return new StructTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
default:
throw new IllegalArgumentException("Bad category: " +
- inspector.getCategory());
+ schema.getCategory());
}
}
private static void writeTypes(OrcProto.Footer.Builder builder,
- TreeWriter treeWriter) {
+ TypeDescription schema) {
OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- switch (treeWriter.inspector.getCategory()) {
- case PRIMITIVE:
- switch (((PrimitiveObjectInspector) treeWriter.inspector).
- getPrimitiveCategory()) {
- case VOID:
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- // The char length needs to be written to file and should be available
- // from the object inspector
- CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo();
- type.setKind(Type.Kind.CHAR);
- type.setMaximumLength(charTypeInfo.getLength());
- break;
- case VARCHAR:
- // The varchar length needs to be written to file and should be available
- // from the object inspector
- VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo();
- type.setKind(Type.Kind.VARCHAR);
- type.setMaximumLength(typeInfo.getLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)((PrimitiveObjectInspector)treeWriter.inspector).getTypeInfo();
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(decTypeInfo.precision());
- type.setScale(decTypeInfo.scale());
- break;
- default:
- throw new IllegalArgumentException("Unknown primitive category: " +
- ((PrimitiveObjectInspector) treeWriter.inspector).
- getPrimitiveCategory());
- }
+ List children = schema.getChildren();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(schema.getPrecision());
+ type.setScale(schema.getScale());
break;
case LIST:
type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(treeWriter.childrenWriters[0].id);
+ type.addSubtypes(children.get(0).getId());
break;
case MAP:
type.setKind(OrcProto.Type.Kind.MAP);
- type.addSubtypes(treeWriter.childrenWriters[0].id);
- type.addSubtypes(treeWriter.childrenWriters[1].id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
break;
case STRUCT:
type.setKind(OrcProto.Type.Kind.STRUCT);
- for(TreeWriter child: treeWriter.childrenWriters) {
- type.addSubtypes(child.id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
}
- for(StructField field: ((StructTreeWriter) treeWriter).fields) {
- type.addFieldNames(field.getFieldName());
+ for(String field: schema.getFieldNames()) {
+ type.addFieldNames(field);
}
break;
case UNION:
type.setKind(OrcProto.Type.Kind.UNION);
- for(TreeWriter child: treeWriter.childrenWriters) {
- type.addSubtypes(child.id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
}
break;
default:
throw new IllegalArgumentException("Unknown category: " +
- treeWriter.inspector.getCategory());
+ schema.getCategory());
}
builder.addTypes(type);
- for(TreeWriter child: treeWriter.childrenWriters) {
- writeTypes(builder, child);
+ if (children != null) {
+ for(TypeDescription child: children) {
+ writeTypes(builder, child);
+ }
}
}
@@ -1853,9 +1798,9 @@ private void flushStripe() throws IOException {
StreamName name = pair.getKey();
long streamSize = pair.getValue().getOutputSize();
builder.addStreams(OrcProto.Stream.newBuilder()
- .setColumn(name.getColumn())
- .setKind(name.getKind())
- .setLength(streamSize));
+ .setColumn(name.getColumn())
+ .setKind(name.getKind())
+ .setLength(streamSize));
if (StreamName.Area.INDEX == name.getArea()) {
indexSize += streamSize;
} else {
@@ -1880,8 +1825,8 @@ private void flushStripe() throws IOException {
// and user specified padding tolerance. Since stripe size can overflow
// the default stripe size we should apply this correction to avoid
// writing portion of last stripe to next hdfs block.
- float correction = overflow > 0 ? (float) overflow
- / (float) adjustedStripeSize : 0.0f;
+ double correction = overflow > 0 ? (double) overflow
+ / (double) adjustedStripeSize : 0.0;
// correction should not be greater than user specified padding
// tolerance
@@ -1939,75 +1884,60 @@ private void flushStripe() throws IOException {
}
private long computeRawDataSize() {
- long result = 0;
- for (TreeWriter child : treeWriter.getChildrenWriters()) {
- result += getRawDataSizeFromInspectors(child, child.inspector);
- }
- return result;
+ return getRawDataSize(treeWriter, schema);
}
- private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) {
+ private long getRawDataSize(TreeWriter child,
+ TypeDescription schema) {
long total = 0;
- switch (oi.getCategory()) {
- case PRIMITIVE:
- total += getRawDataSizeFromPrimitives(child, oi);
- break;
- case LIST:
- case MAP:
- case UNION:
- case STRUCT:
- for (TreeWriter tw : child.childrenWriters) {
- total += getRawDataSizeFromInspectors(tw, tw.inspector);
+ long numVals = child.fileStatistics.getNumberOfValues();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case FLOAT:
+ return numVals * JavaDataModel.get().primitive1();
+ case LONG:
+ case DOUBLE:
+ return numVals * JavaDataModel.get().primitive2();
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ // ORC strings are converted to java Strings. so use JavaDataModel to
+ // compute the overall size of strings
+ StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
+ numVals = numVals == 0 ? 1 : numVals;
+ int avgStringLen = (int) (scs.getSum() / numVals);
+ return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
+ case DECIMAL:
+ return numVals * JavaDataModel.get().lengthOfDecimal();
+ case DATE:
+ return numVals * JavaDataModel.get().lengthOfDate();
+ case BINARY:
+ // get total length of binary blob
+ BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
+ return bcs.getSum();
+ case TIMESTAMP:
+ return numVals * JavaDataModel.get().lengthOfTimestamp();
+ case LIST:
+ case MAP:
+ case UNION:
+ case STRUCT: {
+ TreeWriter[] childWriters = child.getChildrenWriters();
+ List childTypes = schema.getChildren();
+ for (int i=0; i < childWriters.length; ++i) {
+ total += getRawDataSize(childWriters[i], childTypes.get(i));
+ }
+ break;
}
- break;
- default:
- LOG.debug("Unknown object inspector category.");
- break;
+ default:
+ LOG.debug("Unknown object inspector category.");
+ break;
}
return total;
}
- private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) {
- long result = 0;
- long numVals = child.fileStatistics.getNumberOfValues();
- switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case FLOAT:
- return numVals * JavaDataModel.get().primitive1();
- case LONG:
- case DOUBLE:
- return numVals * JavaDataModel.get().primitive2();
- case STRING:
- case VARCHAR:
- case CHAR:
- // ORC strings are converted to java Strings. so use JavaDataModel to
- // compute the overall size of strings
- child = (StringTreeWriter) child;
- StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
- numVals = numVals == 0 ? 1 : numVals;
- int avgStringLen = (int) (scs.getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case BINARY:
- // get total length of binary blob
- BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
- return bcs.getSum();
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- default:
- LOG.debug("Unknown primitive category.");
- break;
- }
-
- return result;
- }
-
private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) {
switch (kind) {
case NONE: return OrcProto.CompressionKind.NONE;
@@ -2027,7 +1957,7 @@ private void writeFileStatistics(OrcProto.Footer.Builder builder,
}
}
- private int writeMetadata(long bodyLength) throws IOException {
+ private int writeMetadata() throws IOException {
getStream();
OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder();
for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) {
@@ -2052,7 +1982,7 @@ private int writeFooter(long bodyLength) throws IOException {
// populate raw data size
rawDataSize = computeRawDataSize();
// serialize the types
- writeTypes(builder, treeWriter);
+ writeTypes(builder, schema);
// add the stripe information
for(OrcProto.StripeInformation stripe: stripes) {
builder.addStripes(stripe);
@@ -2062,7 +1992,7 @@ private int writeFooter(long bodyLength) throws IOException {
// add all of the user metadata
for(Map.Entry entry: userMetadata.entrySet()) {
builder.addMetadata(OrcProto.UserMetadataItem.newBuilder()
- .setName(entry.getKey()).setValue(entry.getValue()));
+ .setName(entry.getKey()).setValue(entry.getValue()));
}
long startPosn = rawWriter.getPos();
OrcProto.Footer footer = builder.build();
@@ -2074,14 +2004,14 @@ private int writeFooter(long bodyLength) throws IOException {
private int writePostScript(int footerLength, int metadataLength) throws IOException {
OrcProto.PostScript.Builder builder =
- OrcProto.PostScript.newBuilder()
- .setCompression(writeCompressionKind(compress))
- .setFooterLength(footerLength)
- .setMetadataLength(metadataLength)
- .setMagic(OrcFile.MAGIC)
- .addVersion(version.getMajor())
- .addVersion(version.getMinor())
- .setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId());
+ OrcProto.PostScript.newBuilder()
+ .setCompression(writeCompressionKind(compress))
+ .setFooterLength(footerLength)
+ .setMetadataLength(metadataLength)
+ .setMagic(OrcFile.MAGIC)
+ .addVersion(version.getMajor())
+ .addVersion(version.getMinor())
+ .setWriterVersion(OrcFile.CURRENT_WRITER.getId());
if (compress != CompressionKind.NONE) {
builder.setCompressionBlockSize(bufferSize);
}
@@ -2120,7 +2050,7 @@ public void addTuple(Tuple tuple) throws IOException {
createRowIndexEntry();
}
}
- memoryManager.addedRow();
+ memoryManager.addedRow(1);
}
@Override
@@ -2132,7 +2062,7 @@ public void close() throws IOException {
memoryManager.removeWriter(path);
// actually close the file
flushStripe();
- int metadataLength = writeMetadata(rawWriter.getPos());
+ int metadataLength = writeMetadata();
int footerLength = writeFooter(rawWriter.getPos() - metadataLength);
rawWriter.writeByte(writePostScript(footerLength, metadataLength));
rawWriter.close();
@@ -2165,19 +2095,19 @@ public long writeIntermediateFooter() throws IOException {
if (callback != null) {
callback.preFooterWrite(callbackContext);
}
- int metaLength = writeMetadata(rawWriter.getPos());
+ int metaLength = writeMetadata();
int footLength = writeFooter(rawWriter.getPos() - metaLength);
rawWriter.writeByte(writePostScript(footLength, metaLength));
stripesAtLastFlush = stripes.size();
- ShimLoader.getHadoopShims().hflush(rawWriter);
+ rawWriter.hflush();
}
return rawWriter.getPos();
}
@Override
public void appendStripe(byte[] stripe, int offset, int length,
- StripeInformation stripeInfo,
- OrcProto.StripeStatistics stripeStatistics) throws IOException {
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics) throws IOException {
checkArgument(stripe != null, "Stripe must not be null");
checkArgument(length <= stripe.length,
"Specified length must not be greater specified array length");
@@ -2187,12 +2117,11 @@ public void appendStripe(byte[] stripe, int offset, int length,
getStream();
long start = rawWriter.getPos();
- long stripeLen = length;
long availBlockSpace = blockSize - (start % blockSize);
// see if stripe can fit in the current hdfs block, else pad the remaining
// space in the block
- if (stripeLen < blockSize && stripeLen > availBlockSpace &&
+ if (length < blockSize && length > availBlockSpace &&
addBlockPadding) {
byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)];
LOG.info(String.format("Padding ORC by %d bytes while merging..",
@@ -2245,7 +2174,7 @@ private List getAllColumnTreeWriters(TreeWriter rootTreeWriter) {
}
private void getAllColumnTreeWritersImpl(TreeWriter tw,
- List result) {
+ List result) {
result.add(tw);
for (TreeWriter child : tw.childrenWriters) {
getAllColumnTreeWritersImpl(child, result);
@@ -2253,9 +2182,9 @@ private void getAllColumnTreeWritersImpl(TreeWriter tw,
}
@Override
- public void appendUserMetadata(List userMetadata) {
+ public void appendUserMetadata(List userMetadata) {
if (userMetadata != null) {
- for (UserMetadataItem item : userMetadata) {
+ for (OrcProto.UserMetadataItem item : userMetadata) {
this.userMetadata.put(item.getName(), item.getValue());
}
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java
new file mode 100644
index 0000000000..2886fe7794
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.ReadOption;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+public class ZeroCopyAdapter {
+ private final FSDataInputStream in;
+ private final ByteBufferPoolAdapter pool;
+ private final static EnumSet CHECK_SUM = EnumSet
+ .noneOf(ReadOption.class);
+ private final static EnumSet NO_CHECK_SUM = EnumSet
+ .of(ReadOption.SKIP_CHECKSUMS);
+
+ public ZeroCopyAdapter(FSDataInputStream in, ByteBufferAllocatorPool poolshim) {
+ this.in = in;
+ if (poolshim != null) {
+ pool = new ByteBufferPoolAdapter(poolshim);
+ } else {
+ pool = null;
+ }
+ }
+
+ public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums)
+ throws IOException {
+ EnumSet options = NO_CHECK_SUM;
+ if (verifyChecksums) {
+ options = CHECK_SUM;
+ }
+ return this.in.read(this.pool, maxLength, options);
+ }
+
+ public final void releaseBuffer(ByteBuffer buffer) {
+ this.in.releaseBuffer(buffer);
+ }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java
deleted file mode 100644
index d0a8fa7da3..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType;
-import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
-
-import javax.annotation.Nullable;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-import java.util.zip.DataFormatException;
-import java.util.zip.Deflater;
-import java.util.zip.Inflater;
-
-class ZlibCodec implements CompressionCodec, DirectDecompressionCodec {
-
- private Boolean direct = null;
-
- private final int level;
- private final int strategy;
-
- public ZlibCodec() {
- level = Deflater.DEFAULT_COMPRESSION;
- strategy = Deflater.DEFAULT_STRATEGY;
- }
-
- private ZlibCodec(int level, int strategy) {
- this.level = level;
- this.strategy = strategy;
- }
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- Deflater deflater = new Deflater(level, true);
- deflater.setStrategy(strategy);
- int length = in.remaining();
- deflater.setInput(in.array(), in.arrayOffset() + in.position(), length);
- deflater.finish();
- int outSize = 0;
- int offset = out.arrayOffset() + out.position();
- while (!deflater.finished() && (length > outSize)) {
- int size = deflater.deflate(out.array(), offset, out.remaining());
- out.position(size + out.position());
- outSize += size;
- offset += size;
- // if we run out of space in the out buffer, use the overflow
- if (out.remaining() == 0) {
- if (overflow == null) {
- deflater.end();
- return false;
- }
- out = overflow;
- offset = out.arrayOffset() + out.position();
- }
- }
- deflater.end();
- return length > outSize;
- }
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
-
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
-
- Inflater inflater = new Inflater(true);
- inflater.setInput(in.array(), in.arrayOffset() + in.position(),
- in.remaining());
- while (!(inflater.finished() || inflater.needsDictionary() ||
- inflater.needsInput())) {
- try {
- int count = inflater.inflate(out.array(),
- out.arrayOffset() + out.position(),
- out.remaining());
- out.position(count + out.position());
- } catch (DataFormatException dfe) {
- throw new IOException("Bad compression data", dfe);
- }
- }
- out.flip();
- inflater.end();
- in.position(in.limit());
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- // see nowrap option in new Inflater(boolean) which disables zlib headers
- try {
- if (ShimLoader.getHadoopShims().getDirectDecompressor(
- DirectCompressionType.ZLIB_NOHEADER) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims()
- .getDirectDecompressor(DirectCompressionType.ZLIB_NOHEADER);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(@Nullable EnumSet modifiers) {
-
- if (modifiers == null) {
- return this;
- }
-
- int l = this.level;
- int s = this.strategy;
-
- for (Modifier m : modifiers) {
- switch (m) {
- case BINARY:
- /* filtered == less LZ77, more huffman */
- s = Deflater.FILTERED;
- break;
- case TEXT:
- s = Deflater.DEFAULT_STRATEGY;
- break;
- case FASTEST:
- // deflate_fast looking for 8 byte patterns
- l = Deflater.BEST_SPEED;
- break;
- case FAST:
- // deflate_fast looking for 16 byte patterns
- l = Deflater.BEST_SPEED + 1;
- break;
- case DEFAULT:
- // deflate_slow looking for 128 byte patterns
- l = Deflater.DEFAULT_COMPRESSION;
- break;
- default:
- break;
- }
- }
- return new ZlibCodec(l, s);
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
deleted file mode 100644
index c80cf6c269..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
+++ /dev/null
@@ -1,217 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-message IntegerStatistics {
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
- optional sint64 sum = 3;
-}
-
-message DoubleStatistics {
- optional double minimum = 1;
- optional double maximum = 2;
- optional double sum = 3;
-}
-
-message StringStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- // sum will store the total length of all strings in a stripe
- optional sint64 sum = 3;
-}
-
-message BucketStatistics {
- repeated uint64 count = 1 [packed=true];
-}
-
-message DecimalStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- optional string sum = 3;
-}
-
-message DateStatistics {
- // min,max values saved as days since epoch
- optional sint32 minimum = 1;
- optional sint32 maximum = 2;
-}
-
-message TimestampStatistics {
- // min,max values saved as milliseconds since epoch
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
-}
-
-message BinaryStatistics {
- // sum will store the total binary blob length in a stripe
- optional sint64 sum = 1;
-}
-
-message ColumnStatistics {
- optional uint64 numberOfValues = 1;
- optional IntegerStatistics intStatistics = 2;
- optional DoubleStatistics doubleStatistics = 3;
- optional StringStatistics stringStatistics = 4;
- optional BucketStatistics bucketStatistics = 5;
- optional DecimalStatistics decimalStatistics = 6;
- optional DateStatistics dateStatistics = 7;
- optional BinaryStatistics binaryStatistics = 8;
- optional TimestampStatistics timestampStatistics = 9;
- optional bool hasNull = 10;
-}
-
-message RowIndexEntry {
- repeated uint64 positions = 1 [packed=true];
- optional ColumnStatistics statistics = 2;
-}
-
-message RowIndex {
- repeated RowIndexEntry entry = 1;
-}
-
-message BloomFilter {
- optional uint32 numHashFunctions = 1;
- repeated fixed64 bitset = 2;
-}
-
-message BloomFilterIndex {
- repeated BloomFilter bloomFilter = 1;
-}
-
-message Stream {
- // if you add new index stream kinds, you need to make sure to update
- // StreamName to ensure it is added to the stripe in the right area
- enum Kind {
- PRESENT = 0;
- DATA = 1;
- LENGTH = 2;
- DICTIONARY_DATA = 3;
- DICTIONARY_COUNT = 4;
- SECONDARY = 5;
- ROW_INDEX = 6;
- BLOOM_FILTER = 7;
- }
- optional Kind kind = 1;
- optional uint32 column = 2;
- optional uint64 length = 3;
-}
-
-message ColumnEncoding {
- enum Kind {
- DIRECT = 0;
- DICTIONARY = 1;
- DIRECT_V2 = 2;
- DICTIONARY_V2 = 3;
- }
- optional Kind kind = 1;
- optional uint32 dictionarySize = 2;
-}
-
-message StripeFooter {
- repeated Stream streams = 1;
- repeated ColumnEncoding columns = 2;
- optional string writerTimezone = 3;
-}
-
-message Type {
- enum Kind {
- BOOLEAN = 0;
- BYTE = 1;
- SHORT = 2;
- INT = 3;
- LONG = 4;
- FLOAT = 5;
- DOUBLE = 6;
- STRING = 7;
- BINARY = 8;
- TIMESTAMP = 9;
- LIST = 10;
- MAP = 11;
- STRUCT = 12;
- UNION = 13;
- DECIMAL = 14;
- DATE = 15;
- VARCHAR = 16;
- CHAR = 17;
- }
- optional Kind kind = 1;
- repeated uint32 subtypes = 2 [packed=true];
- repeated string fieldNames = 3;
- optional uint32 maximumLength = 4;
- optional uint32 precision = 5;
- optional uint32 scale = 6;
-}
-
-message StripeInformation {
- optional uint64 offset = 1;
- optional uint64 indexLength = 2;
- optional uint64 dataLength = 3;
- optional uint64 footerLength = 4;
- optional uint64 numberOfRows = 5;
-}
-
-message UserMetadataItem {
- optional string name = 1;
- optional bytes value = 2;
-}
-
-message StripeStatistics {
- repeated ColumnStatistics colStats = 1;
-}
-
-message Metadata {
- repeated StripeStatistics stripeStats = 1;
-}
-
-message Footer {
- optional uint64 headerLength = 1;
- optional uint64 contentLength = 2;
- repeated StripeInformation stripes = 3;
- repeated Type types = 4;
- repeated UserMetadataItem metadata = 5;
- optional uint64 numberOfRows = 6;
- repeated ColumnStatistics statistics = 7;
- optional uint32 rowIndexStride = 8;
-}
-
-enum CompressionKind {
- NONE = 0;
- ZLIB = 1;
- SNAPPY = 2;
- LZO = 3;
-}
-
-// Serialized length must be less that 255 bytes
-message PostScript {
- optional uint64 footerLength = 1;
- optional CompressionKind compression = 2;
- optional uint64 compressionBlockSize = 3;
- // the version of the file format
- // [0, 11] = Hive 0.11
- // [0, 12] = Hive 0.12
- repeated uint32 version = 4 [packed = true];
- optional uint64 metadataLength = 5;
- // Version of the writer:
- // 0 (or missing) = original
- // 1 = HIVE-8732 fixed
- optional uint32 writerVersion = 6;
- // Leave this last in the record
- optional string magic = 8000;
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
index b63b497d5b..608d066913 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
@@ -27,6 +27,7 @@
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;
+import org.apache.orc.OrcConf;
import org.apache.tajo.BuiltinStorages;
import org.apache.tajo.catalog.CatalogUtil;
import org.apache.tajo.catalog.Schema;
@@ -61,6 +62,7 @@ public class TestCompressionStorages {
public TestCompressionStorages(String type) throws IOException {
this.dataFormat = type;
conf = new TajoConf();
+ conf.setBoolean("hive.exec.orc.zerocopy", true);
testDir = CommonTestingUtil.getTestDir(TEST_PATH);
fs = testDir.getFileSystem(conf);
@@ -71,7 +73,8 @@ public static Collection