From 8b6f8c27285bffaf49bc3f86d57720e1f68aceda Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Fri, 18 Mar 2016 18:12:19 +0900
Subject: [PATCH 01/16] TAJO-2102: Replace the Presto-based ORCScanner with an Apache ORC (Hive) based OrcScanner
---
.../tajo/catalog/store/HiveCatalogStore.java | 1 +
.../catalog/store/TestHiveCatalogStore.java | 1 +
.../org/apache/tajo/cli/tools/TajoDump.java | 2 +-
.../org/apache/tajo/datum/TimestampDatum.java | 2 +-
.../java/org/apache/tajo/unit/TimeUnit.java | 2 +
tajo-project/pom.xml | 2 +-
tajo-storage/tajo-storage-hdfs/pom.xml | 52 +
.../apache/tajo/storage/orc/ORCScanner.java | 332 ----
.../apache/tajo/storage/orc/OrcScanner.java | 397 +++++
.../ObjectInspectorFactory.java | 2 +-
.../thirdparty/orc/MetadataReader.java | 128 ++
.../tajo/storage/thirdparty/orc/OrcFile.java | 2 +-
.../thirdparty/orc/OrcRecordReader.java | 460 +++++
.../tajo/storage/thirdparty/orc/OrcUtils.java | 35 +
.../thirdparty/orc/RecordReaderUtils.java | 479 +++++
.../thirdparty/orc/TreeReaderFactory.java | 1576 +++++++++++++++++
.../storage/thirdparty/orc/WriterImpl.java | 6 +-
.../tajo/storage/TestCompressionStorages.java | 13 +-
.../org/apache/tajo/storage/TestStorages.java | 69 +-
.../src/test/resources/storage-default.xml | 2 +-
20 files changed, 3196 insertions(+), 367 deletions(-)
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
index 63f18b6f75..1d0d261d12 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
@@ -44,6 +44,7 @@
import org.apache.tajo.algebra.IsNullPredicate;
import org.apache.tajo.algebra.JsonHelper;
import org.apache.tajo.catalog.*;
+import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.catalog.partition.PartitionMethodDesc;
import org.apache.tajo.catalog.proto.CatalogProtos;
import org.apache.tajo.catalog.proto.CatalogProtos.*;
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
index 7e1a3a4ff6..6bb66a1a46 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
@@ -78,6 +78,7 @@ public static void setUp() throws Exception {
conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString());
conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri);
conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, warehousePath.toUri().toString());
+ conf.setBoolean("datanucleus.schema.autoCreateAll", true); // TODO: check this is valid
// create local HiveCatalogStore.
TajoConf tajoConf = new TajoConf(conf);
diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
index 4df418f5be..c9fa2b488c 100644
--- a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
+++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java
@@ -208,7 +208,7 @@ private static void dumpDatabase(TajoClient client, String databaseName, PrintWr
}
}
writer.write("\n\n");
- } catch (Exception e) {
+ } catch (Throwable e) {
// dump for each table can throw any exception. We need to skip the exception case.
// here, the error message prints out via stderr.
System.err.println("ERROR:" + tableName + "," + e.getMessage());
diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
index 5b4c152a51..f69e7da2f4 100644
--- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
+++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java
@@ -125,7 +125,7 @@ public String toString() {
/**
*
- * @param tm TimeMEta
+ * @param tm TimeMeta
* @param timeZone Timezone
* @param includeTimeZone Add timezone if it is true. It is usually used for TIMEZONEZ
* @return A timestamp string
diff --git a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java
index 8062f2de5a..a03a930d78 100644
--- a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java
+++ b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java
@@ -26,4 +26,6 @@ public class TimeUnit {
public static final int DAY = HOUR * 24;
public static final int PART_UNIT = 5*TimeUnit.MIN;
+
+ public static final int MILLIS_PER_SECOND = 1000;
}
diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml
index cd86d3b350..16e1eb074f 100644
--- a/tajo-project/pom.xml
+++ b/tajo-project/pom.xml
@@ -36,7 +36,7 @@
2.7.22.5.01.1.1
- 1.1.0
+ 2.0.04.0.34.Final2.66.1.26
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index 5f66395e94..2c4538a6ba 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -349,6 +349,58 @@
presto-orc0.141
+
+ org.apache.hive
+ hive-orc
+ ${hive.version}
+
+
+ org.apache.hive
+ hive-serde
+ ${hive.version}
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ log4j-1.2-api
+ org.apache.logging.log4j
+
+
+
+
+ org.apache.hive
+ hive-exec
+ ${hive.version}
+
+
+ log4j-1.2-api
+ org.apache.logging.log4j
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ antlr-runtime
+ org.antlr
+
+
+ jline
+ jline
+
+
+ calcite-core
+ org.apache.calcite
+
+
+ calcite-avatica
+ org.apache.calcite
+
+
+
+
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java
deleted file mode 100644
index 0a4ebc6948..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java
+++ /dev/null
@@ -1,332 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc;
-
-import com.facebook.presto.orc.OrcDataSource;
-import com.facebook.presto.orc.OrcPredicate;
-import com.facebook.presto.orc.OrcReader;
-import com.facebook.presto.orc.OrcRecordReader;
-import com.facebook.presto.orc.memory.AggregatedMemoryContext;
-import com.facebook.presto.orc.metadata.OrcMetadataReader;
-import com.facebook.presto.spi.block.Block;
-import com.facebook.presto.spi.type.*;
-import com.google.protobuf.InvalidProtocolBufferException;
-import io.airlift.units.DataSize;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.tajo.TajoConstants;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.TableMeta;
-import org.apache.tajo.common.TajoDataTypes;
-import org.apache.tajo.conf.TajoConf;
-import org.apache.tajo.datum.*;
-import org.apache.tajo.exception.NotImplementedException;
-import org.apache.tajo.exception.TajoRuntimeException;
-import org.apache.tajo.plan.expr.EvalNode;
-import org.apache.tajo.storage.FileScanner;
-import org.apache.tajo.storage.StorageConstants;
-import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.storage.VTuple;
-import org.apache.tajo.storage.fragment.Fragment;
-import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource;
-import org.apache.tajo.util.datetime.DateTimeUtil;
-import org.joda.time.DateTimeZone;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TimeZone;
-
-/**
- * OrcScanner for reading ORC files
- */
-public class ORCScanner extends FileScanner {
- private static final Log LOG = LogFactory.getLog(ORCScanner.class);
- private OrcRecordReader recordReader;
- private Block[] blocks;
- private int currentPosInBatch = 0;
- private int batchSize = 0;
- private Tuple outTuple;
- private AggregatedMemoryContext aggrMemoryContext = new AggregatedMemoryContext();
-
- public ORCScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) {
- super(conf, schema, meta, fragment);
- }
-
- private FileSystem fs;
- private FSDataInputStream fis;
-
- private static class ColumnInfo {
- TajoDataTypes.DataType type;
- int id;
- }
-
- /**
- * Temporary array for caching column info
- */
- private ColumnInfo [] targetColInfo;
-
- @Override
- public void init() throws IOException {
- OrcReader orcReader;
- DataSize maxMergeDistance = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_MERGE_DISTANCE,
- StorageConstants.DEFAULT_ORC_MAX_MERGE_DISTANCE)), DataSize.Unit.BYTE);
- DataSize maxReadSize = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_READ_BUFFER_SIZE,
- StorageConstants.DEFAULT_ORC_MAX_READ_BUFFER_SIZE)), DataSize.Unit.BYTE);
-
- if (targets == null) {
- targets = schema.toArray();
- }
-
- outTuple = new VTuple(targets.length);
-
- Path path = fragment.getPath();
-
- if(fs == null) {
- fs = FileScanner.getFileSystem((TajoConf)conf, path);
- }
-
- if(fis == null) {
- fis = fs.open(path);
- }
-
- OrcDataSource orcDataSource = new HdfsOrcDataSource(
- this.fragment.getPath().toString(),
- fis,
- fs.getFileStatus(path).getLen(),
- maxMergeDistance,
- maxReadSize);
-
- targetColInfo = new ColumnInfo[targets.length];
- for (int i=0; i columnMap = new HashMap<>();
- for (ColumnInfo colInfo: targetColInfo) {
- columnMap.put(colInfo.id, createFBtypeByTajoType(colInfo.type));
- }
-
- orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize);
-
- TimeZone timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE,
- TajoConstants.DEFAULT_SYSTEM_TIMEZONE));
-
- // TODO: make OrcPredicate useful
- // presto-orc uses joda timezone, so it needs to be converted.
- recordReader = orcReader.createRecordReader(columnMap, OrcPredicate.TRUE,
- fragment.getStartKey(), fragment.getLength(), DateTimeZone.forTimeZone(timezone), aggrMemoryContext);
-
- super.init();
- LOG.debug("file fragment { path: " + fragment.getPath() +
- ", start offset: " + fragment.getStartKey() +
- ", length: " + fragment.getLength() + "}");
- }
-
- @Override
- public Tuple next() throws IOException {
- if (currentPosInBatch == batchSize) {
- getNextBatch();
-
- // EOF
- if (batchSize == -1) {
- return null;
- }
- }
-
- for (int i=0; i stripeStats;
+ private int metadataSize;
+ protected List types;
+ private List userMetadata;
+ private List fileStats;
+ private List stripes;
+ protected int rowIndexStride;
+ private long contentLength, numberOfRows;
+
+ private List versionList;
+
+ //serialized footer - Keeping this around for use by getFileMetaInfo()
+ // will help avoid cpu cycles spend in deserializing at cost of increased
+ // memory footprint.
+ private ByteBuffer footerByteBuffer;
+ // Same for metastore cache - maintains the same background buffer, but includes postscript.
+ // This will only be set if the file footer/metadata was read from disk.
+ private ByteBuffer footerMetaAndPsBuffer;
+
+ private OrcRecordReader recordReader;
+
+ private long recordCount = 0;
+
+ /**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param in the file being read
+ * @param path the filename for error messages
+ * @param psLen the postscript length
+ * @param buffer the tail of the file
+ * @throws IOException
+ */
+ static void ensureOrcFooter(FSDataInputStream in,
+ Path path,
+ int psLen,
+ ByteBuffer buffer) throws IOException {
+ int len = OrcFile.MAGIC.length();
+ if (psLen < len + 1) {
+ throw new IOException("Malformed ORC file " + path +
+ ". Invalid postscript length " + psLen);
+ }
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1 - len;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, len).equals(OrcFile.MAGIC)) {
+ // If it isn't there, this may be the 0.11.0 version of ORC.
+ // Read the first 3 bytes of the file to check for the header
+ byte[] header = new byte[len];
+ in.readFully(0, header, 0, len);
+ // if it isn't there, this isn't an ORC file
+ if (!Text.decode(header, 0 , len).equals(OrcFile.MAGIC)) {
+ throw new IOException("Malformed ORC file " + path +
+ ". Invalid postscript.");
+ }
+ }
+ }
+
+ /**
+ * Build a version string out of an array.
+ * @param version the version number as a list
+ * @return the human readable form of the version string
+ */
+ private static String versionString(List version) {
+ StringBuilder buffer = new StringBuilder();
+ for(int i=0; i < version.size(); ++i) {
+ if (i != 0) {
+ buffer.append('.');
+ }
+ buffer.append(version.get(i));
+ }
+ return buffer.toString();
+ }
+
+ /**
+ * Check to see if this ORC file is from a future version and if so,
+ * warn the user that we may not be able to read all of the column encodings.
+ * @param log the logger to write any error message to
+ * @param path the data source path for error messages
+ * @param version the version of hive that wrote the file.
+ */
+ static void checkOrcVersion(Log log, Path path, List version) {
+ if (version.size() >= 1) {
+ int major = version.get(0);
+ int minor = 0;
+ if (version.size() >= 2) {
+ minor = version.get(1);
+ }
+ if (major > OrcFile.Version.CURRENT.getMajor() ||
+ (major == OrcFile.Version.CURRENT.getMajor() &&
+ minor > OrcFile.Version.CURRENT.getMinor())) {
+ log.warn(path + " was written by a future Hive version " +
+ versionString(version) +
+ ". This file may not be readable by this version of Hive.");
+ }
+ }
+ }
+
+ public OrcScanner(Configuration conf, Schema schema, TableMeta meta, Fragment fragment) throws IOException {
+ super(conf, schema, meta, fragment);
+
+ this.path = this.fragment.getPath();
+ this.fileSystem = this.path.getFileSystem(conf);
+ }
+
+ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
+ Path path,
+ long maxFileLength
+ ) throws IOException {
+ FSDataInputStream file = fs.open(path);
+
+ // figure out the size of the file using the option or filesystem
+ long size;
+ if (maxFileLength == Long.MAX_VALUE) {
+ size = fs.getFileStatus(path).getLen();
+ } else {
+ size = maxFileLength;
+ }
+
+ //read last bytes into buffer to get PostScript
+ int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
+ ByteBuffer buffer = ByteBuffer.allocate(readSize);
+ assert buffer.position() == 0;
+ file.readFully((size - readSize),
+ buffer.array(), buffer.arrayOffset(), readSize);
+ buffer.position(0);
+
+ //read the PostScript
+ //get length of PostScript
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ ensureOrcFooter(file, path, psLen, buffer);
+ int psOffset = readSize - 1 - psLen;
+ OrcProto.PostScript ps = extractPostScript(buffer, path, psLen, psOffset);
+
+ int footerSize = (int) ps.getFooterLength();
+ int metadataSize = (int) ps.getMetadataLength();
+
+ //check if extra bytes need to be read
+ ByteBuffer fullFooterBuffer = null;
+ int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+ if (extra > 0) {
+ //more bytes need to be read, seek back to the right place and read extra bytes
+ ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
+ file.readFully((size - readSize - extra), extraBuf.array(),
+ extraBuf.arrayOffset() + extraBuf.position(), extra);
+ extraBuf.position(extra);
+ //append with already read bytes
+ extraBuf.put(buffer);
+ buffer = extraBuf;
+ buffer.position(0);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(footerSize + metadataSize);
+ } else {
+ //footer is already in the bytes in buffer, just adjust position, length
+ buffer.position(psOffset - footerSize - metadataSize);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(psOffset);
+ }
+
+ // remember position for later
+ buffer.mark();
+
+ file.close();
+
+ return new FileMetaInfo(
+ ps.getCompression().toString(),
+ (int) ps.getCompressionBlockSize(),
+ (int) ps.getMetadataLength(),
+ buffer,
+ ps.getVersionList(),
+ org.apache.orc.OrcFile.WriterVersion.FUTURE,
+ fullFooterBuffer
+ );
+ }
+
+ public OrcRecordReader getRecordReader() throws IOException {
+ boolean skipCorruptRecords = conf.getBoolean("orc.skip.corrupt-records", false);
+
+ return new OrcRecordReader(meta, this.stripes, fileSystem, schema, targets, fragment,
+ skipCorruptRecords, types, codec, bufferSize, rowIndexStride, conf);
+ }
+
+ @Override
+ public void init() throws IOException {
+ super.init();
+
+ FileMetaInfo footerMetaData = extractMetaInfoFromFooter(fileSystem, path, maxLength);
+ this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
+ MetaInfoObjExtractor rInfo =
+ new MetaInfoObjExtractor(footerMetaData.compressionType,
+ footerMetaData.bufferSize,
+ footerMetaData.metadataSize,
+ footerMetaData.footerBuffer
+ );
+ this.footerByteBuffer = footerMetaData.footerBuffer;
+ this.compressionKind = rInfo.compressionKind;
+ this.codec = rInfo.codec;
+ this.bufferSize = rInfo.bufferSize;
+ this.metadataSize = rInfo.metadataSize;
+ this.stripeStats = rInfo.metadata.getStripeStatsList();
+ this.types = rInfo.footer.getTypesList();
+ this.rowIndexStride = rInfo.footer.getRowIndexStride();
+ this.contentLength = rInfo.footer.getContentLength();
+ this.numberOfRows = rInfo.footer.getNumberOfRows();
+ this.userMetadata = rInfo.footer.getMetadataList();
+ this.fileStats = rInfo.footer.getStatisticsList();
+ this.versionList = footerMetaData.versionList;
+ this.stripes = rInfo.footer.getStripesList();
+
+ recordReader = getRecordReader();
+ }
+
+ @Override
+ public Tuple next() throws IOException {
+ Tuple next = recordReader.next();
+ if (next != null) {
+ recordCount++;
+ }
+ return next;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ // TODO: improve this
+ this.close();
+ recordReader = getRecordReader();
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (recordReader != null) {
+ recordReader.close();
+ tableStats.setNumBytes(recordReader.getNumBytes());
+ tableStats.setNumRows(recordCount);
+ }
+ }
+
+ @Override
+ public boolean isProjectable() {
+ return true;
+ }
+
+ @Override
+ public boolean isSelectable() {
+ return false;
+ }
+
+ @Override
+ public void setFilter(EvalNode filter) {
+ // TODO: implement this
+ }
+
+ @Override
+ public float getProgress() {
+ return inited ? recordReader.getProgress() : super.getProgress();
+ }
+
+ @Override
+ public boolean isSplittable() {
+ return true;
+ }
+
+ private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path,
+ int psLen, int psAbsOffset) throws IOException {
+ // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
+ assert bb.hasArray();
+ CodedInputStream in = CodedInputStream.newInstance(
+ bb.array(), bb.arrayOffset() + psAbsOffset, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
+ checkOrcVersion(LOG, path, ps.getVersionList());
+
+ // Check compression codec.
+ switch (ps.getCompression()) {
+ case NONE:
+ break;
+ case ZLIB:
+ break;
+ case SNAPPY:
+ break;
+ case LZO:
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown compression");
+ }
+ return ps;
+ }
+
+ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
+ int footerSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(footerAbsPos);
+ bb.limit(footerAbsPos + footerSize);
+ return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
+ }
+
+ private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(metadataAbsPos);
+ bb.limit(metadataAbsPos + metadataSize);
+ return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
+ Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
+ }
+
+ /**
+ * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
+ * from serialized fields.
+ * As the fields are final, the fields need to be initialized in the constructor and
+ * can't be done in some helper function. So this helper class is used instead.
+ *
+ */
+ private static class MetaInfoObjExtractor{
+ final org.apache.orc.CompressionKind compressionKind;
+ final CompressionCodec codec;
+ final int bufferSize;
+ final int metadataSize;
+ final OrcProto.Metadata metadata;
+ final OrcProto.Footer footer;
+
+ MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
+ ByteBuffer footerBuffer) throws IOException {
+
+ this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr);
+ this.bufferSize = bufferSize;
+ this.codec = OrcUtils.createCodec(compressionKind);
+ this.metadataSize = metadataSize;
+
+ int position = footerBuffer.position();
+ int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
+
+ this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
+ this.footer = extractFooter(
+ footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);
+
+ footerBuffer.position(position);
+ }
+ }
+
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
index 061ba0d034..4855ff9fe3 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
@@ -83,7 +83,7 @@ public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type data
break;
default:
- throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender");
+ throw new UnsupportedException(dataType.name()+" is not supported yet in ORCAppender");
}
return oi;
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java
new file mode 100644
index 0000000000..a3685a7240
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.impl.BufferChunk;
+import org.apache.orc.impl.InStream;
+import org.apache.orc.impl.OrcIndex;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public class MetadataReader implements Closeable {
+
+ private final FSDataInputStream file;
+ private final CompressionCodec codec;
+ private final int bufferSize;
+ private final int typeCount;
+
+ public MetadataReader(FileSystem fileSystem, Path path,
+ CompressionCodec codec, int bufferSize, int typeCount) throws IOException {
+ this(fileSystem.open(path), codec, bufferSize, typeCount);
+ }
+
+ public MetadataReader(FSDataInputStream file,
+ CompressionCodec codec, int bufferSize, int typeCount) {
+ this.file = file;
+ this.codec = codec;
+ this.bufferSize = bufferSize;
+ this.typeCount = typeCount;
+ }
+
+ public OrcIndex readRowIndex(OrcProto.StripeInformation stripe,
+ OrcProto.StripeFooter footer, boolean[] included, OrcProto.RowIndex[] indexes,
+ boolean[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices) throws IOException {
+ if (footer == null) {
+ footer = readStripeFooter(stripe);
+ }
+ if (indexes == null) {
+ indexes = new OrcProto.RowIndex[typeCount];
+ }
+ if (bloomFilterIndices == null) {
+ bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
+ }
+ long offset = stripe.getOffset();
+ List streams = footer.getStreamsList();
+ for (int i = 0; i < streams.size(); i++) {
+ OrcProto.Stream stream = streams.get(i);
+ OrcProto.Stream nextStream = null;
+ if (i < streams.size() - 1) {
+ nextStream = streams.get(i+1);
+ }
+ int col = stream.getColumn();
+ int len = (int) stream.getLength();
+ // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
+ // filter and combine the io to read row index and bloom filters for that column together
+ if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
+ boolean readBloomFilter = false;
+ if (sargColumns != null && sargColumns[col] &&
+ nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
+ len += nextStream.getLength();
+ i += 1;
+ readBloomFilter = true;
+ }
+ if ((included == null || included[col]) && indexes[col] == null) {
+ byte[] buffer = new byte[len];
+ file.readFully(offset, buffer, 0, buffer.length);
+ ByteBuffer bb = ByteBuffer.wrap(buffer);
+ indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
+ Lists.newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
+ codec, bufferSize));
+ if (readBloomFilter) {
+ bb.position((int) stream.getLength());
+ bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
+ "bloom_filter", Lists.newArrayList(new BufferChunk(bb, 0)),
+ nextStream.getLength(), codec, bufferSize));
+ }
+ }
+ }
+ offset += len;
+ }
+
+ OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
+ return index;
+ }
+
+ public OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException {
+ long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
+ int tailLength = (int) stripe.getFooterLength();
+
+ // read the footer
+ ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
+ file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
+ return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.newArrayList(new BufferChunk(tailBuf, 0)),
+ tailLength, codec, bufferSize));
+ }
+
+ @Override
+ public void close() throws IOException {
+ file.close();
+ }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java
index a291953981..b3d9d30795 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java
@@ -52,7 +52,7 @@ public final class OrcFile {
*/
public static enum Version {
V_0_11("0.11", 0, 11),
- V_0_12("0.12", 0, 12);
+ V_0_12("0.12", 0, 12);
public static final Version CURRENT = V_0_12;
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
new file mode 100644
index 0000000000..18a602bd34
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
@@ -0,0 +1,460 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.DataReader;
+import org.apache.orc.OrcProto;
+import org.apache.orc.impl.*;
+import org.apache.orc.impl.StreamName;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.storage.Tuple;
+import org.apache.tajo.storage.VTuple;
+import org.apache.tajo.storage.fragment.FileFragment;
+import org.apache.tajo.storage.thirdparty.orc.TreeReaderFactory.DatumTreeReader;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Reads Tajo {@link Tuple}s from the stripes of an ORC file that overlap a
+ * given {@link FileFragment}. Only the stripes whose start offset lies inside
+ * the fragment are read; projection is done per column via the
+ * {@code included} mask and one {@link TreeReaderFactory.DatumTreeReader} per
+ * target column.
+ *
+ * <p>NOTE(review): several generic type arguments appear to have been lost in
+ * patch transcription (e.g. {@code List stripes}, {@code Map streams}) —
+ * presumably {@code List<OrcProto.StripeInformation>} and
+ * {@code Map<StreamName, InStream>}; confirm against the committed file.</p>
+ *
+ * <p>Not thread-safe: positioning state (currentStripe, rowInStripe, ...) is
+ * mutated by every call.</p>
+ */
+public class OrcRecordReader implements Closeable {
+
+  private final Log LOG = LogFactory.getLog(OrcRecordReader.class);
+
+  private final Path path;
+  // number of rows in stripes that precede this fragment (absolute row base)
+  private final long firstRow;
+  // stripes overlapping the fragment, in file order
+  private final List stripes = new ArrayList<>();
+  private OrcProto.StripeFooter stripeFooter;
+  // total rows across the selected stripes
+  private final long totalRowCount;
+  private final CompressionCodec codec;
+  private final List types;
+  private final int bufferSize;
+  // included[0] is the root struct column; included[i] maps schema column i-1
+  private final boolean[] included;
+  private final long rowIndexStride;
+  // position within the current stripe
+  private long rowInStripe = 0;
+  private int currentStripe = -1;
+  private long rowBaseInStripe = 0;
+  private long rowCountInStripe = 0;
+  // open InStreams of the current stripe, keyed by stream name
+  private final Map streams = new HashMap<>();
+  DiskRangeList bufferChunks = null;
+  // one tree reader per projected (target) column
+  private final TreeReaderFactory.DatumTreeReader[] reader;
+  private final OrcProto.RowIndex[] indexes;
+  private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+  private final Configuration conf;
+  private final org.apache.tajo.storage.thirdparty.orc.MetadataReader metadata;
+  private final DataReader dataReader;
+  // reused output tuple — callers must not hold a reference across next() calls
+  private final Tuple result;
+
+  /**
+   * Creates a reader over the stripes of {@code fragment}.
+   *
+   * <p>Stripes that start before the fragment's start key are skipped and
+   * their row counts accumulate into {@code firstRow}; stripes that start
+   * before fragment end are selected for reading. The constructor also opens
+   * the data reader and advances to the first readable row.</p>
+   */
+  public OrcRecordReader(TableMeta meta,
+                         List stripes,
+                         FileSystem fileSystem,
+                         Schema schema,
+                         Column[] target,
+                         FileFragment fragment,
+                         boolean skipCorruptRecords,
+                         List types,
+                         CompressionCodec codec,
+                         int bufferSize,
+                         long strideRate,
+                         Configuration conf
+  ) throws IOException {
+
+    result = new VTuple(target.length);
+
+    this.conf = conf;
+    this.path = fragment.getPath();
+    this.codec = codec;
+    this.types = types;
+    this.bufferSize = bufferSize;
+    this.included = new boolean[schema.size() + 1];
+    included[0] = target.length > 0; // always include root column except when target schema size is 0
+    Schema targetSchema = new Schema(target);
+    for (int i = 1; i < included.length; i++) {
+      included[i] = targetSchema.contains(schema.getColumn(i - 1));
+    }
+    this.rowIndexStride = strideRate;
+    this.metadata = new org.apache.tajo.storage.thirdparty.orc.MetadataReader(fileSystem, path, codec, bufferSize, types.size());
+
+    long rows = 0;
+    long skippedRows = 0;
+    long offset = fragment.getStartKey();
+    long maxOffset = fragment.getStartKey() + fragment.getLength();
+    for(OrcProto.StripeInformation stripe: stripes) {
+      long stripeStart = stripe.getOffset();
+      if (offset > stripeStart) {
+        skippedRows += stripe.getNumberOfRows();
+      } else if (stripeStart < maxOffset) {
+        this.stripes.add(stripe);
+        rows += stripe.getNumberOfRows();
+      }
+    }
+
+    // TODO: we could change the ctor to pass this externally
+    this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, true, codec);
+    this.dataReader.open();
+
+    firstRow = skippedRows;
+    totalRowCount = rows;
+    // NOTE(review): boxing to Boolean is unnecessary — a primitive boolean
+    // would serve; presumably kept from the Hive source this was ported from.
+    Boolean skipCorrupt = skipCorruptRecords;
+
+    reader = new DatumTreeReader[target.length];
+    for (int i = 0; i < reader.length; i++) {
+      reader[i] = TreeReaderFactory.createTreeReader(meta, schema.getColumnId(target[i].getQualifiedName()), target[i],
+          skipCorrupt);
+    }
+
+    indexes = new OrcProto.RowIndex[types.size()];
+    bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+    // position on the first selected row (reads the first stripe if needed)
+    advanceToNextRow(reader, 0L, true);
+  }
+
+  /**
+   * Plan the ranges of the file that we need to read given the list of
+   * columns and row groups.
+   *
+   * @param streamList the list of streams available
+   * @param includedColumns which columns are needed
+   * @param doMergeBuffers whether adjacent ranges should be merged into one
+   * @return the list of disk ranges that will be loaded
+   */
+  static DiskRangeList planReadPartialDataStreams
+      (List streamList,
+       boolean[] includedColumns,
+       boolean doMergeBuffers) {
+    long offset = 0;
+    // figure out which columns have a present stream
+    DiskRangeList.CreateHelper list = new DiskRangeList.CreateHelper();
+    for (OrcProto.Stream stream : streamList) {
+      long length = stream.getLength();
+      int column = stream.getColumn();
+      OrcProto.Stream.Kind streamKind = stream.getKind();
+      // since stream kind is optional, first check if it exists
+      if (stream.hasKind() &&
+          (org.apache.orc.impl.StreamName.getArea(streamKind) == org.apache.orc.impl.StreamName.Area.DATA) &&
+          includedColumns[column]) {
+        RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
+      }
+      offset += length;
+    }
+    return list.extract();
+  }
+
+  /**
+   * Creates an {@link InStream} for every included DATA-area stream
+   * description, slicing its bytes out of the already-read disk ranges, and
+   * registers it in {@code streams}. Index-area and excluded-column streams
+   * are skipped (their lengths still advance the running offset).
+   */
+  void createStreams(List streamDescriptions,
+                     DiskRangeList ranges,
+                     boolean[] includeColumn,
+                     CompressionCodec codec,
+                     int bufferSize,
+                     Map streams) throws IOException {
+    long streamOffset = 0;
+    for (OrcProto.Stream streamDesc : streamDescriptions) {
+      int column = streamDesc.getColumn();
+      if ((includeColumn != null && !includeColumn[column]) ||
+          streamDesc.hasKind() &&
+              (org.apache.orc.impl.StreamName.getArea(streamDesc.getKind()) != org.apache.orc.impl.StreamName.Area.DATA)) {
+        streamOffset += streamDesc.getLength();
+        continue;
+      }
+      List buffers = RecordReaderUtils.getStreamBuffers(
+          ranges, streamOffset, streamDesc.getLength());
+      org.apache.orc.impl.StreamName name = new StreamName(column, streamDesc.getKind());
+      streams.put(name, InStream.create(name.toString(), buffers,
+          streamDesc.getLength(), codec, bufferSize));
+      streamOffset += streamDesc.getLength();
+    }
+  }
+
+  // Reads only the disk ranges needed for the projected columns of the stripe.
+  private void readPartialDataStreams(OrcProto.StripeInformation stripe) throws IOException {
+    List streamList = stripeFooter.getStreamsList();
+    DiskRangeList toRead = planReadPartialDataStreams(streamList, included, true);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
+    }
+    bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
+    }
+
+    createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
+  }
+
+  /**
+   * Skip over rows that we aren't selecting, so that the next row is
+   * one that we will read.
+   *
+   * @param nextRow the row we want to go to (absolute, excluding firstRow)
+   * @return true when positioned on a readable row; false when the target is
+   *         past the current stripe and stripe advancement was not allowed
+   * @throws IOException
+   */
+  private boolean advanceToNextRow(
+      TreeReaderFactory.TreeReader[] reader, long nextRow, boolean canAdvanceStripe)
+      throws IOException {
+    long nextRowInStripe = nextRow - rowBaseInStripe;
+
+    if (nextRowInStripe >= rowCountInStripe) {
+      if (canAdvanceStripe) {
+        advanceStripe();
+      }
+      return canAdvanceStripe;
+    }
+    if (nextRowInStripe != rowInStripe) {
+      if (rowIndexStride != 0) {
+        // jump to the row group containing nextRow, then skip the remainder
+        int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+        seekToRowEntry(reader, rowGroup);
+        for (TreeReaderFactory.TreeReader eachReader : reader) {
+          eachReader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
+        }
+      } else {
+        // no row index available: skip forward linearly
+        for (TreeReaderFactory.TreeReader eachReader : reader) {
+          eachReader.skipRows(nextRowInStripe - rowInStripe);
+        }
+      }
+      rowInStripe = nextRowInStripe;
+    }
+    return true;
+  }
+
+  // True while the current stripe still has unread rows; advanceToNextRow
+  // moves to the next stripe, so this covers the whole fragment.
+  public boolean hasNext() throws IOException {
+    return rowInStripe < rowCountInStripe;
+  }
+
+  /**
+   * Returns the next projected tuple, or null at end of the fragment.
+   * The returned Tuple instance is reused between calls.
+   */
+  public Tuple next() throws IOException {
+    if (hasNext()) {
+      try {
+        for (int i = 0; i < reader.length; i++) {
+          result.put(i, reader[i].next());
+        }
+        // find the next row
+        rowInStripe += 1;
+        advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
+        return result;
+      } catch (IOException e) {
+        // Rethrow exception with file name in log message
+        throw new IOException("Error reading file: " + path, e);
+      }
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Read the next stripe until we find a row that we don't skip.
+   *
+   * @throws IOException
+   */
+  private void advanceStripe() throws IOException {
+    rowInStripe = rowCountInStripe;
+    while (rowInStripe >= rowCountInStripe &&
+        currentStripe < stripes.size() - 1) {
+      currentStripe += 1;
+      readStripe();
+    }
+  }
+
+  /**
+   * Read the current stripe into memory.
+   *
+   * @throws IOException
+   */
+  private void readStripe() throws IOException {
+    OrcProto.StripeInformation stripe = beginReadStripe();
+
+    // if we haven't skipped the whole stripe, read the data
+    if (rowInStripe < rowCountInStripe) {
+      // if we aren't projecting columns or filtering rows, just read it all
+      // NOTE(review): 'included' is assigned in the constructor and is never
+      // null here, so the readAllDataStreams branch appears unreachable.
+      if (included == null) {
+        readAllDataStreams(stripe);
+      } else {
+        readPartialDataStreams(stripe);
+      }
+
+      for (TreeReaderFactory.TreeReader eachReader : reader) {
+        eachReader.startStripe(streams, stripeFooter);
+      }
+      // if we skipped the first row group, move the pointers forward
+      if (rowInStripe != 0) {
+        seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
+      }
+    }
+  }
+
+  // Closes all per-stripe streams and releases zero-copy buffers (if the
+  // data reader tracks them) before the next stripe is loaded.
+  private void clearStreams() throws IOException {
+    // explicit close of all streams to de-ref ByteBuffers
+    for (InStream is : streams.values()) {
+      is.close();
+    }
+    if (bufferChunks != null) {
+      if (dataReader.isTrackingDiskRanges()) {
+        for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
+          if (!(range instanceof BufferChunk)) {
+            continue;
+          }
+          dataReader.releaseBuffer(((BufferChunk) range).getChunk());
+        }
+      }
+    }
+    bufferChunks = null;
+    streams.clear();
+  }
+
+  OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException {
+    return metadata.readStripeFooter(stripe);
+  }
+
+  /**
+   * Loads the footer of the current stripe, releases the previous stripe's
+   * streams, and resets the row counters and cached row indexes.
+   *
+   * @return the stripe information of the current stripe
+   */
+  private OrcProto.StripeInformation beginReadStripe() throws IOException {
+    OrcProto.StripeInformation stripe = stripes.get(currentStripe);
+    stripeFooter = readStripeFooter(stripe);
+    clearStreams();
+    // setup the position in the stripe
+    rowCountInStripe = stripe.getNumberOfRows();
+    rowInStripe = 0;
+    rowBaseInStripe = 0;
+    for (int i = 0; i < currentStripe; ++i) {
+      rowBaseInStripe += stripes.get(i).getNumberOfRows();
+    }
+    // reset all of the indexes
+    for (int i = 0; i < indexes.length; ++i) {
+      indexes[i] = null;
+    }
+    return stripe;
+  }
+
+  // Reads the stripe's whole data section in one request, then builds streams.
+  private void readAllDataStreams(OrcProto.StripeInformation stripe) throws IOException {
+    long start = stripe.getIndexLength();
+    long end = start + stripe.getDataLength();
+    // explicitly trigger 1 big read
+    DiskRangeList toRead = new DiskRangeList(start, end);
+    bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+    List streamDescriptions = stripeFooter.getStreamsList();
+    createStreams(streamDescriptions, bufferChunks, included, codec, bufferSize, streams);
+  }
+
+  // Absolute row number in the file, including rows before the fragment.
+  public long getRowNumber() {
+    return rowInStripe + rowBaseInStripe + firstRow;
+  }
+
+  // Fraction of the fragment's rows consumed so far.
+  // NOTE(review): yields NaN when the fragment selects no stripes
+  // (totalRowCount == 0) — confirm callers tolerate that.
+  public float getProgress() {
+    return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
+  }
+
+  // Maps a fragment-relative row number to the index of its stripe.
+  private int findStripe(long rowNumber) {
+    for (int i = 0; i < stripes.size(); i++) {
+      OrcProto.StripeInformation stripe = stripes.get(i);
+      if (stripe.getNumberOfRows() > rowNumber) {
+        return i;
+      }
+      rowNumber -= stripe.getNumberOfRows();
+    }
+    throw new IllegalArgumentException("Seek after the end of reader range");
+  }
+
+  OrcIndex readRowIndex(
+      int stripeIndex, boolean[] included) throws IOException {
+    return readRowIndex(stripeIndex, included, null, null);
+  }
+
+  /**
+   * Reads the row index of the given stripe, reusing the cached footer and
+   * index arrays when the stripe is the one currently open.
+   */
+  OrcIndex readRowIndex(int stripeIndex, boolean[] included, OrcProto.RowIndex[] indexes,
+                        OrcProto.BloomFilterIndex[] bloomFilterIndex) throws IOException {
+    OrcProto.StripeInformation stripe = stripes.get(stripeIndex);
+    OrcProto.StripeFooter stripeFooter = null;
+    // if this is the current stripe, use the cached objects.
+    if (stripeIndex == currentStripe) {
+      stripeFooter = this.stripeFooter;
+      indexes = indexes == null ? this.indexes : indexes;
+      bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
+    }
+    return metadata.readRowIndex(stripe, stripeFooter, included, indexes, null,
+        bloomFilterIndex);
+  }
+
+  // Seeks every tree reader to the start of the given row group, using the
+  // positions recorded in the cached row indexes.
+  private void seekToRowEntry(TreeReaderFactory.TreeReader []reader, int rowEntry)
+      throws IOException {
+    PositionProvider[] index = new PositionProvider[indexes.length];
+    for (int i = 0; i < indexes.length; ++i) {
+      if (indexes[i] != null) {
+        index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
+      }
+    }
+    for (TreeReaderFactory.TreeReader eachReader : reader) {
+      eachReader.seek(index);
+    }
+  }
+
+  /**
+   * Seeks to the given absolute row number (must be &gt;= firstRow), loading
+   * the containing stripe and its row index if necessary.
+   */
+  public void seekToRow(long rowNumber) throws IOException {
+    if (rowNumber < 0) {
+      throw new IllegalArgumentException("Seek to a negative row number " +
+          rowNumber);
+    } else if (rowNumber < firstRow) {
+      throw new IllegalArgumentException("Seek before reader range " +
+          rowNumber);
+    }
+    // convert to our internal form (rows from the beginning of slice)
+    rowNumber -= firstRow;
+
+    // move to the right stripe
+    int rightStripe = findStripe(rowNumber);
+    if (rightStripe != currentStripe) {
+      currentStripe = rightStripe;
+      readStripe();
+    }
+    readRowIndex(currentStripe, included);
+
+    // if we aren't to the right row yet, advance in the stripe.
+    advanceToNextRow(reader, rowNumber, true);
+  }
+
+  // Total bytes read from the file so far, as counted by the default data
+  // reader. NOTE(review): assumes dataReader is always a DefaultDataReader —
+  // true for the current constructor, but the cast would break if the TODO of
+  // injecting an external DataReader is ever implemented.
+  public long getNumBytes() {
+    return ((RecordReaderUtils.DefaultDataReader)dataReader).getReadBytes();
+  }
+
+  @Override
+  public void close() throws IOException {
+    clearStreams();
+    dataReader.close();
+  }
+
+  /**
+   * A PositionProvider that walks the positions of a single row-index entry
+   * sequentially, starting from an optional offset.
+   */
+  public static final class PositionProviderImpl implements PositionProvider {
+    private final OrcProto.RowIndexEntry entry;
+    private int index;
+
+    public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
+      this(entry, 0);
+    }
+
+    public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
+      this.entry = entry;
+      this.index = startPos;
+    }
+
+    @Override
+    public long getNext() {
+      return entry.getPositions(index++);
+    }
+  }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
index 3a474dd188..5c7fa458ee 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
@@ -21,6 +21,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.serde2.objectinspector.*;
+import org.apache.orc.*;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.impl.*;
+import org.apache.orc.impl.SnappyCodec;
import java.util.Arrays;
import java.util.HashMap;
@@ -198,4 +202,35 @@ public static int getFlattenedColumnsCount(ObjectInspector inspector) {
return numWriters;
}
+  /**
+   * Creates a compression codec instance for the given ORC compression kind.
+   *
+   * <p>NONE yields {@code null} (no codec); ZLIB and SNAPPY use the bundled
+   * ORC implementations; LZO is loaded reflectively from Hive so the
+   * dependency stays optional.</p>
+   *
+   * @param kind the ORC compression kind
+   * @return a codec for the kind, or null for NONE
+   * @throws IllegalArgumentException if the kind is unknown or the LZO codec
+   *         class cannot be loaded or instantiated
+   */
+  public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.CompressionKind kind) {
+    switch (kind) {
+      case NONE:
+        return null;
+      case ZLIB:
+        return new org.apache.orc.impl.ZlibCodec();
+      case SNAPPY:
+        return new SnappyCodec();
+      case LZO:
+        try {
+          ClassLoader loader = Thread.currentThread().getContextClassLoader();
+          if (loader == null) {
+            throw new RuntimeException("error while getting a class loader");
+          }
+          // wildcard generic restored — the patch text had dropped the "<?"
+          @SuppressWarnings("unchecked")
+          Class<? extends org.apache.orc.CompressionCodec> lzo =
+              (Class<? extends CompressionCodec>)
+                  loader.loadClass("org.apache.hadoop.hive.ql.io.orc.LzoCodec");
+          return lzo.newInstance();
+        } catch (ClassNotFoundException e) {
+          throw new IllegalArgumentException("LZO is not available.", e);
+        } catch (InstantiationException e) {
+          throw new IllegalArgumentException("Problem initializing LZO", e);
+        } catch (IllegalAccessException e) {
+          throw new IllegalArgumentException("Insufficient access to LZO", e);
+        }
+      default:
+        throw new IllegalArgumentException("Unknown compression codec: " +
+            kind);
+    }
+  }
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java
new file mode 100644
index 0000000000..5253711664
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java
@@ -0,0 +1,479 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import com.google.common.collect.ComparisonChain;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.shims.HadoopShims;
+import org.apache.hadoop.hive.shims.ShimLoader;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.DataReader;
+import org.apache.orc.OrcProto;
+import org.apache.orc.impl.BufferChunk;
+import org.apache.orc.impl.DirectDecompressionCodec;
+import org.apache.orc.impl.OutStream;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Static helpers for planning and reading ORC disk ranges, plus the default
+ * {@link DataReader} implementation used by {@link OrcRecordReader}.
+ * NOTE(review): appears ported from Hive's ORC RecordReaderUtils with a
+ * read-byte counter added — confirm against the upstream source.
+ *
+ * <p>NOTE(review): several generic type arguments appear to have been lost in
+ * patch transcription (e.g. the raw {@code TreeMap} fields below — presumably
+ * {@code TreeMap<Key, ByteBuffer>}); confirm against the committed file.</p>
+ */
+public class RecordReaderUtils {
+
+  /**
+   * Default DataReader: positioned reads over an FSDataInputStream, with
+   * optional HDFS zero-copy reads via the Hadoop shims. Counts bytes read in
+   * {@code readBytes}.
+   */
+  public static class DefaultDataReader implements DataReader {
+    private FSDataInputStream file;
+    private ByteBufferAllocatorPool pool;
+    // non-null only when zero-copy reads are active
+    private HadoopShims.ZeroCopyReaderShim zcr;
+    private FileSystem fs;
+    private Path path;
+    private boolean useZeroCopy;
+    private CompressionCodec codec;
+    // running total of bytes read through this reader
+    private long readBytes = 0;
+
+    public DefaultDataReader(
+        FileSystem fs, Path path, boolean useZeroCopy, CompressionCodec codec) {
+      this.fs = fs;
+      this.path = path;
+      this.useZeroCopy = useZeroCopy;
+      this.codec = codec;
+    }
+
+    // Opens the file; sets up the zero-copy shim when requested and available.
+    @Override
+    public void open() throws IOException {
+      this.file = fs.open(path);
+      if (useZeroCopy) {
+        pool = new ByteBufferAllocatorPool();
+        zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool);
+      } else {
+        pool = null;
+        zcr = null;
+      }
+    }
+
+    @Override
+    public DiskRangeList readFileData(
+        DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
+      return readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
+    }
+
+    @Override
+    public void close() throws IOException {
+      if (file != null) {
+        file.close();
+      }
+      if (pool != null) {
+        pool.clear();
+      }
+    }
+
+    // Buffers must be released back only when zero-copy is in use.
+    @Override
+    public boolean isTrackingDiskRanges() {
+      return zcr != null;
+    }
+
+    @Override
+    public void releaseBuffer(ByteBuffer buffer) {
+      zcr.releaseBuffer(buffer);
+    }
+
+    public long getReadBytes() {
+      return readBytes;
+    }
+
+    /**
+     * Read the list of ranges from the file.
+     * @param file the file to read
+     * @param zcr the zero-copy shim, or null to use plain positioned reads
+     * @param base the base of the stripe
+     * @param range the disk ranges within the stripe to read
+     * @param doForceDirect copy plain reads into direct buffers when true
+     * @return the bytes read for each disk range, which is the same length as
+     *    ranges
+     * @throws IOException
+     */
+    private DiskRangeList readDiskRanges(FSDataInputStream file,
+                                         HadoopShims.ZeroCopyReaderShim zcr,
+                                         long base,
+                                         DiskRangeList range,
+                                         boolean doForceDirect) throws IOException {
+      if (range == null) return null;
+      DiskRangeList prev = range.prev;
+      if (prev == null) {
+        prev = new DiskRangeList.MutateHelper(range);
+      }
+      while (range != null) {
+        if (range.hasData()) {
+          range = range.next;
+          continue;
+        }
+        int len = (int) (range.getEnd() - range.getOffset());
+        long off = range.getOffset();
+        if (zcr != null) {
+          // zero-copy path: may return several partial buffers per range
+          file.seek(base + off);
+          boolean hasReplaced = false;
+          while (len > 0) {
+            ByteBuffer partial = zcr.readBuffer(len, false);
+            readBytes += partial.remaining();
+            BufferChunk bc = new BufferChunk(partial, off);
+            if (!hasReplaced) {
+              range.replaceSelfWith(bc);
+              hasReplaced = true;
+            } else {
+              range.insertAfter(bc);
+            }
+            range = bc;
+            int read = partial.remaining();
+            len -= read;
+            off += read;
+          }
+        } else {
+          // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
+          byte[] buffer = new byte[len];
+          file.readFully((base + off), buffer, 0, buffer.length);
+          readBytes += buffer.length;
+          ByteBuffer bb = null;
+          if (doForceDirect) {
+            bb = ByteBuffer.allocateDirect(len);
+            bb.put(buffer);
+            bb.position(0);
+            bb.limit(len);
+          } else {
+            bb = ByteBuffer.wrap(buffer);
+          }
+          range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
+        }
+        range = range.next;
+      }
+      return prev.next;
+    }
+  }
+
+  public static DataReader createDefaultDataReader(
+      FileSystem fs, Path path, boolean useZeroCopy, CompressionCodec codec) {
+    return new DefaultDataReader(fs, path, useZeroCopy, codec);
+  }
+
+  // Returns, per column id, whether the column has a PRESENT (null) stream.
+  public static boolean[] findPresentStreamsByColumn(
+      List streamList, List types) {
+    boolean[] hasNull = new boolean[types.size()];
+    for(OrcProto.Stream stream: streamList) {
+      if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) {
+        hasNull[stream.getColumn()] = true;
+      }
+    }
+    return hasNull;
+  }
+
+  /**
+   * Does region A overlap region B? The end points are inclusive on both sides.
+   * @param leftA A's left point
+   * @param rightA A's right point
+   * @param leftB B's left point
+   * @param rightB B's right point
+   * @return Does region A overlap region B?
+   */
+  static boolean overlap(long leftA, long rightA, long leftB, long rightB) {
+    if (leftA <= leftB) {
+      return rightA >= leftB;
+    }
+    return rightB >= leftA;
+  }
+
+  // Adds the full [offset, offset+length) span of a stream to the read plan.
+  public static void addEntireStreamToRanges(
+      long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
+    list.addOrMerge(offset, offset + length, doMergeBuffers, false);
+  }
+
+  /**
+   * Adds to the read plan only the byte spans of the stream that belong to
+   * selected row groups, using the row-index positions to bound each group.
+   */
+  public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
+      boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
+      OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
+      long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
+    for (int group = 0; group < includedRowGroups.length; ++group) {
+      if (!includedRowGroups[group]) continue;
+      int posn = getIndexPosition(
+          encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
+      long start = index.getEntry(group).getPositions(posn);
+      final long nextGroupOffset;
+      boolean isLast = group == (includedRowGroups.length - 1);
+      nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
+
+      start += offset;
+      long end = offset + estimateRgEndOffset(
+          isCompressed, isLast, nextGroupOffset, length, compressionSize);
+      list.addOrMerge(start, end, doMergeBuffers, true);
+    }
+  }
+
+  public static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
+      long nextGroupOffset, long streamLength, int bufferSize) {
+    // figure out the worst case last location
+    // if adjacent groups have the same compressed block offset then stretch the slop
+    // by factor of 2 to safely accommodate the next compression block.
+    // One for the current compression block and another for the next compression block.
+    long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
+    return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
+  }
+
+  // Counts of row-index positions recorded per stream kind; used by
+  // getIndexPosition to compute offsets into a row-index entry.
+  private static final int BYTE_STREAM_POSITIONS = 1;
+  private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+  private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1;
+  private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+
+  /**
+   * Get the offset in the index positions for the column that the given
+   * stream starts.
+   * @param columnEncoding the encoding of the column
+   * @param columnType the type of the column
+   * @param streamType the kind of the stream
+   * @param isCompressed is the file compressed
+   * @param hasNulls does the column have a PRESENT stream?
+   * @return the number of positions that will be used for that stream
+   */
+  public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
+                                     OrcProto.Type.Kind columnType,
+                                     OrcProto.Stream.Kind streamType,
+                                     boolean isCompressed,
+                                     boolean hasNulls) {
+    if (streamType == OrcProto.Stream.Kind.PRESENT) {
+      return 0;
+    }
+    int compressionValue = isCompressed ? 1 : 0;
+    int base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
+    switch (columnType) {
+      case BOOLEAN:
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+      case DATE:
+      case STRUCT:
+      case MAP:
+      case LIST:
+      case UNION:
+        return base;
+      case CHAR:
+      case VARCHAR:
+      case STRING:
+        if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+            columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+          return base;
+        } else {
+          if (streamType == OrcProto.Stream.Kind.DATA) {
+            return base;
+          } else {
+            return base + BYTE_STREAM_POSITIONS + compressionValue;
+          }
+        }
+      case BINARY:
+        if (streamType == OrcProto.Stream.Kind.DATA) {
+          return base;
+        }
+        return base + BYTE_STREAM_POSITIONS + compressionValue;
+      case DECIMAL:
+        if (streamType == OrcProto.Stream.Kind.DATA) {
+          return base;
+        }
+        return base + BYTE_STREAM_POSITIONS + compressionValue;
+      case TIMESTAMP:
+        if (streamType == OrcProto.Stream.Kind.DATA) {
+          return base;
+        }
+        return base + RUN_LENGTH_INT_POSITIONS + compressionValue;
+      default:
+        throw new IllegalArgumentException("Unknown type " + columnType);
+    }
+  }
+
+  // for uncompressed streams, what is the most overlap with the following set
+  // of rows (long vint literal group).
+  static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512;
+
+  /**
+   * Is this stream part of a dictionary?
+   * @return is this part of a dictionary?
+   */
+  public static boolean isDictionary(OrcProto.Stream.Kind kind,
+                                     OrcProto.ColumnEncoding encoding) {
+    assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT;
+    OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind();
+    return kind == OrcProto.Stream.Kind.DICTIONARY_DATA ||
+        (kind == OrcProto.Stream.Kind.LENGTH &&
+            (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+                encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2));
+  }
+
+  /**
+   * Build a string representation of a list of disk ranges.
+   * @param range ranges to stringify
+   * @return the resulting string
+   */
+  public static String stringifyDiskRanges(DiskRangeList range) {
+    StringBuilder buffer = new StringBuilder();
+    buffer.append("[");
+    boolean isFirst = true;
+    while (range != null) {
+      if (!isFirst) {
+        buffer.append(", {");
+      } else {
+        buffer.append("{");
+      }
+      isFirst = false;
+      buffer.append(range.toString());
+      buffer.append("}");
+      range = range.next;
+    }
+    buffer.append("]");
+    return buffer.toString();
+  }
+
+  /**
+   * Extracts the buffers of one stream [offset, offset+length) out of the
+   * read disk ranges, slicing and shifting partial first/last buffers so the
+   * result is stream-relative.
+   */
+  public static List getStreamBuffers(DiskRangeList range, long offset, long length) {
+    // This assumes sorted ranges (as do many other parts of ORC code).
+    ArrayList buffers = new ArrayList();
+    if (length == 0) return buffers;
+    long streamEnd = offset + length;
+    boolean inRange = false;
+    while (range != null) {
+      if (!inRange) {
+        if (range.getEnd() <= offset) {
+          range = range.next;
+          continue; // Skip until we are in range.
+        }
+        inRange = true;
+        if (range.getOffset() < offset) {
+          // Partial first buffer, add a slice of it.
+          buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset));
+          if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer.
+          range = range.next;
+          continue;
+        }
+      } else if (range.getOffset() >= streamEnd) {
+        break;
+      }
+      if (range.getEnd() > streamEnd) {
+        // Partial last buffer (may also be the first buffer), add a slice of it.
+        buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset));
+        break;
+      }
+      // Buffer that belongs entirely to one stream.
+      // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot
+      // because bufferChunks is also used by clearStreams for zcr. Create a useless dup.
+      buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset));
+      if (range.getEnd() == streamEnd) break;
+      range = range.next;
+    }
+    return buffers;
+  }
+
+  // Returns a zero-copy reader shim when the codec permits it (null codec or
+  // an available direct-decompression codec); otherwise null.
+  static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file,
+      CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException {
+    if ((codec == null || ((codec instanceof DirectDecompressionCodec)
+        && ((DirectDecompressionCodec) codec).isAvailable()))) {
+      /* codec is null or is available */
+      return ShimLoader.getHadoopShims().getZeroCopyReader(file, pool);
+    }
+    return null;
+  }
+
+  // this is an implementation copied from ElasticByteBufferPool in hadoop-2,
+  // which lacks a clear()/clean() operation
+  public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim {
+    // Pool key: buffers are ordered by capacity, tie-broken by an
+    // ever-increasing insertion generation to keep keys unique.
+    private static final class Key implements Comparable {
+      private final int capacity;
+      private final long insertionGeneration;
+
+      Key(int capacity, long insertionGeneration) {
+        this.capacity = capacity;
+        this.insertionGeneration = insertionGeneration;
+      }
+
+      @Override
+      public int compareTo(Key other) {
+        return ComparisonChain.start().compare(capacity, other.capacity)
+            .compare(insertionGeneration, other.insertionGeneration).result();
+      }
+
+      @Override
+      public boolean equals(Object rhs) {
+        if (rhs == null) {
+          return false;
+        }
+        try {
+          Key o = (Key) rhs;
+          return (compareTo(o) == 0);
+        } catch (ClassCastException e) {
+          return false;
+        }
+      }
+
+      @Override
+      public int hashCode() {
+        return new HashCodeBuilder().append(capacity).append(insertionGeneration)
+            .toHashCode();
+      }
+    }
+
+    // heap and direct buffers are pooled separately
+    private final TreeMap buffers = new TreeMap();
+
+    private final TreeMap directBuffers = new TreeMap();
+
+    private long currentGeneration = 0;
+
+    private final TreeMap getBufferTree(boolean direct) {
+      return direct ? directBuffers : buffers;
+    }
+
+    // Drops all pooled buffers (the missing operation in hadoop-2's pool).
+    public void clear() {
+      buffers.clear();
+      directBuffers.clear();
+    }
+
+    // Returns the smallest pooled buffer with capacity >= length, or
+    // allocates a fresh one when none fits.
+    @Override
+    public ByteBuffer getBuffer(boolean direct, int length) {
+      TreeMap tree = getBufferTree(direct);
+      Map.Entry entry = tree.ceilingEntry(new Key(length, 0));
+      if (entry == null) {
+        return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer
+            .allocate(length);
+      }
+      tree.remove(entry.getKey());
+      return entry.getValue();
+    }
+
+    @Override
+    public void putBuffer(ByteBuffer buffer) {
+      TreeMap tree = getBufferTree(buffer.isDirect());
+      while (true) {
+        Key key = new Key(buffer.capacity(), currentGeneration++);
+        if (!tree.containsKey(key)) {
+          tree.put(key, buffer);
+          return;
+        }
+        // Buffers are indexed by (capacity, generation).
+        // If our key is not unique on the first try, we try again
+      }
+    }
+  }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
new file mode 100644
index 0000000000..c1781ef6a6
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
@@ -0,0 +1,1576 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcProto;
+import org.apache.orc.impl.*;
+import org.apache.orc.impl.DynamicByteArray;
+import org.apache.orc.impl.SerializationUtils;
+import org.apache.orc.impl.StreamName;
+import org.apache.orc.impl.WriterImpl;
+import org.apache.tajo.TajoConstants;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.catalog.TypeDesc;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.datum.DatumFactory;
+import org.apache.tajo.datum.NullDatum;
+import org.apache.tajo.exception.TajoRuntimeException;
+import org.apache.tajo.exception.UnsupportedException;
+import org.apache.tajo.storage.StorageConstants;
+import org.apache.tajo.unit.TimeUnit;
+import org.apache.tajo.util.datetime.DateTimeUtil;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+public class TreeReaderFactory {
+
+ private final static Log LOG = LogFactory.getLog(TreeReaderFactory.class);
+
+ public static class TreeReaderSchema {
+
+ /**
+ * The types in the ORC file.
+ */
+ List fileTypes;
+
+ /**
+ * The treeReaderSchema that the reader should read as.
+ */
+ List schemaTypes;
+
+ /**
+ * The subtype of the row STRUCT. Different than 0 for ACID.
+ */
+ int innerStructSubtype;
+
+ public TreeReaderSchema() {
+ fileTypes = null;
+ schemaTypes = null;
+ innerStructSubtype = -1;
+ }
+
+ public TreeReaderSchema fileTypes(List fileTypes) {
+ this.fileTypes = fileTypes;
+ return this;
+ }
+
+ public TreeReaderSchema schemaTypes(List schemaTypes) {
+ this.schemaTypes = schemaTypes;
+ return this;
+ }
+
+ public TreeReaderSchema innerStructSubtype(int innerStructSubtype) {
+ this.innerStructSubtype = innerStructSubtype;
+ return this;
+ }
+
+ public List getFileTypes() {
+ return fileTypes;
+ }
+
+ public List getSchemaTypes() {
+ return schemaTypes;
+ }
+
+ public int getInnerStructSubtype() {
+ return innerStructSubtype;
+ }
+ }
+
+ public abstract static class TreeReader {
+ protected final int columnId;
+ protected BitFieldReader present = null;
+ protected boolean valuePresent = false;
+
+ TreeReader(int columnId) throws IOException {
+ this(columnId, null);
+ }
+
+ protected TreeReader(int columnId, InStream in) throws IOException {
+ this.columnId = columnId;
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
+ InStream in,
+ boolean signed, boolean skipCorrupt) throws IOException {
+ switch (kind) {
+ case DIRECT_V2:
+ case DICTIONARY_V2:
+ return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
+ case DIRECT:
+ case DICTIONARY:
+ return new RunLengthIntegerReader(in, signed);
+ default:
+ throw new IllegalArgumentException("Unknown encoding " + kind);
+ }
+ }
+
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ checkEncoding(stripeFooter.getColumnsList().get(columnId));
+ InStream in = streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.PRESENT));
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ /**
+ * Seek to the given position.
+ *
+ * @param index the indexes loaded from the file
+ * @throws IOException
+ */
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ if (present != null) {
+ present.seek(index);
+ }
+ }
+
+ protected long countNonNulls(long rows) throws IOException {
+ if (present != null) {
+ long result = 0;
+ for (long c = 0; c < rows; ++c) {
+ if (present.next() == 1) {
+ result += 1;
+ }
+ }
+ return result;
+ } else {
+ return rows;
+ }
+ }
+
+ abstract void skipRows(long rows) throws IOException;
+
+ public BitFieldReader getPresent() {
+ return present;
+ }
+ }
+
+ public abstract static class DatumTreeReader extends TreeReader {
+
+ DatumTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected DatumTreeReader(int columnId, InStream in) throws IOException {
+ super(columnId, in);
+ }
+
+ Datum next() throws IOException {
+ if (present != null) {
+ valuePresent = present.next() == 1;
+ }
+ return NullDatum.get();
+ }
+ }
+
+ public abstract static class RawStringTreeReader extends TreeReader {
+ RawStringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected RawStringTreeReader(int columnId, InStream in) throws IOException {
+ super(columnId, in);
+ }
+
+ byte[] next() throws IOException {
+ if (present != null) {
+ valuePresent = present.next() == 1;
+ }
+ return null;
+ }
+ }
+
+ public static class BooleanTreeReader extends DatumTreeReader {
+ protected BitFieldReader reader = null;
+
+ BooleanTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ if (data != null) {
+ reader = new BitFieldReader(data, 1);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new BitFieldReader(streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), 1);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createBool(reader.next() == 1) : NullDatum.get();
+ }
+ }
+
+ public static class ByteTreeReader extends DatumTreeReader {
+ protected RunLengthByteReader reader = null;
+
+ ByteTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.reader = new RunLengthByteReader(data);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new RunLengthByteReader(streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createBit(reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class ShortTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ ShortTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected ShortTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInt2((short) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class InetTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ InetTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected InetTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInet4((int) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class IntTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ IntTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected IntTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInt4((int) reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class LongTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, skipCorrupt);
+ }
+
+ protected LongTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding,
+ boolean skipCorrupt)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createInt8(reader.next()) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class FloatTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ private final org.apache.orc.impl.SerializationUtils utils;
+
+ FloatTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new org.apache.orc.impl.SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createFloat4(utils.readFloat(stream)) : NullDatum.get();
+ }
+
+ @Override
+ protected void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (int i = 0; i < items; ++i) {
+ utils.readFloat(stream);
+ }
+ }
+ }
+
+ public static class DoubleTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ private final org.apache.orc.impl.SerializationUtils utils;
+
+ DoubleTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name =
+ new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ? DatumFactory.createFloat8(utils.readDouble(stream)) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long len = items * 8;
+ while (len > 0) {
+ len -= stream.skip(len);
+ }
+ }
+ }
+
+ public static class BinaryTreeReader extends DatumTreeReader {
+ protected InStream stream;
+ protected IntegerReader lengths = null;
+ protected final LongColumnVector scratchlcv;
+
+ BinaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ lengths.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+
+ if (valuePresent) {
+ int len = (int) lengths.next();
+ byte[] buf = new byte[len];
+ int offset = 0;
+ while (len > 0) {
+ int written = stream.read(buf, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= written;
+ offset += written;
+ }
+ return DatumFactory.createBlob(buf);
+ } else {
+ return NullDatum.get();
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+ }
+
+ public static class TimestampTreeReader extends DatumTreeReader {
+ protected IntegerReader data = null;
+ protected IntegerReader nanos = null;
+ private final boolean skipCorrupt;
+ private Map baseTimestampMap;
+ private long base_timestamp;
+ private final TimeZone readerTimeZone;
+ private TimeZone writerTimeZone;
+ private boolean hasSameTZRules;
+
+ TimestampTreeReader(TableMeta meta, int columnId, boolean skipCorrupt) throws IOException {
+ this(meta, columnId, null, null, null, null, skipCorrupt);
+ }
+
+ protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStream, InStream dataStream,
+ InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt)
+ throws IOException {
+ super(columnId, presentStream);
+ this.skipCorrupt = skipCorrupt;
+ this.baseTimestampMap = new HashMap<>();
+ this.readerTimeZone = TimeZone.getDefault();
+ this.writerTimeZone = readerTimeZone;
+ this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ this.base_timestamp = getBaseTimestamp(TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE,
+ TajoConstants.DEFAULT_SYSTEM_TIMEZONE)).getID());
+ if (encoding != null) {
+ checkEncoding(encoding);
+
+ if (dataStream != null) {
+ this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
+ }
+
+ if (nanosStream != null) {
+ this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
+ nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
+ base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
+ }
+
+ private long getBaseTimestamp(String timeZoneId) throws IOException {
+ // to make sure new readers read old files in the same way
+ if (timeZoneId == null || timeZoneId.isEmpty()) {
+ timeZoneId = readerTimeZone.getID();
+ }
+
+ if (!baseTimestampMap.containsKey(timeZoneId)) {
+ writerTimeZone = TimeZone.getTimeZone(timeZoneId);
+ hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ sdf.setTimeZone(writerTimeZone);
+ try {
+ long epoch =
+ sdf.parse(org.apache.orc.impl.WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
+ baseTimestampMap.put(timeZoneId, epoch);
+ return epoch;
+ } catch (ParseException e) {
+ throw new IOException("Unable to create base timestamp", e);
+ } finally {
+ sdf.setTimeZone(readerTimeZone);
+ }
+ }
+
+ return baseTimestampMap.get(timeZoneId);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ data.seek(index);
+ nanos.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+
+ if (valuePresent) {
+ long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp);
+ long offset = 0;
+ // If reader and writer time zones have different rules, adjust the timezone difference
+ // between reader and writer taking day light savings into account.
+ if (!hasSameTZRules) {
+ offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
+ }
+ long adjustedMillis = millis + offset;
+
+ // Sometimes the reader timezone might have changed after adding the adjustedMillis.
+ // To account for that change, check for any difference in reader timezone after
+ // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time).
+ if (!hasSameTZRules &&
+ (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
+ long newOffset =
+ writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
+ adjustedMillis = millis + newOffset;
+ }
+ return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis));
+ } else {
+ return NullDatum.get();
+ }
+ }
+
+ private static int parseNanos(long serialized) {
+ int zeros = 7 & (int) serialized;
+ int result = (int) (serialized >>> 3);
+ if (zeros != 0) {
+ for (int i = 0; i <= zeros; ++i) {
+ result *= 10;
+ }
+ }
+ return result;
+ }
+
+ // borrowed from Facebook's TimestampStreamReader
+ private static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) {
+ long millis = (seconds + baseTimestampInSeconds) * TimeUnit.MILLIS_PER_SECOND;
+ long nanos = parseNanos(serializedNanos);
+
+ // the rounding error exists because java always rounds up when dividing integers
+ // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000)
+ // to get the correct value we need
+ // (-42 - 1)*1000 + 999 = -42001
+ // (42)*1000 + 1 = 42001
+ if (millis < 0 && nanos != 0) {
+ millis -= 1000;
+ }
+ // Truncate nanos to millis and add to mills
+ return millis + (nanos / 1_000_000);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ data.skip(items);
+ nanos.skip(items);
+ }
+ }
+
+ public static class DateTreeReader extends DatumTreeReader {
+ protected IntegerReader reader = null;
+
+ DateTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected DateTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ super.next();
+ return valuePresent ?
+ DatumFactory.createDate((int) reader.next() + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH) : NullDatum.get();
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ public static class StringTreeReader extends DatumTreeReader {
+ protected RawStringTreeReader reader;
+
+ StringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ byte[] bytes = reader.next();
+ return bytes == null ? NullDatum.get() : DatumFactory.createText(bytes);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ private final static class BasicTextReaderShim {
+ private final InputStream in;
+
+ public BasicTextReaderShim(InputStream in) {
+ this.in = in;
+ }
+
+ public byte[] read(int len) throws IOException {
+ int offset = 0;
+ byte[] bytes = new byte[len];
+ while (len > 0) {
+ int written = in.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish read from " + in + " read "
+ + (offset) + " bytes out of " + bytes.length);
+ }
+ len -= written;
+ offset += written;
+ }
+ return bytes;
+ }
+ }
+
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe.
+ */
+ public static class StringDirectTreeReader extends RawStringTreeReader {
+ protected InStream stream;
+ protected BasicTextReaderShim data;
+ protected IntegerReader lengths;
+ private final LongColumnVector scratchlcv;
+
+ StringDirectTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected StringDirectTreeReader(int columnId, InStream present, InStream data,
+ InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
+ super(columnId, present);
+ this.scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ this.lengths = createIntegerReader(encoding, length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ data = new BasicTextReaderShim(stream);
+
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ // the data stream was positioned by stream.seek() above; now position the length stream
+ lengths.seek(index);
+ }
+
+ @Override
+ byte[] next() throws IOException {
+ super.next();
+ int len = (int) lengths.next();
+ return valuePresent ? data.read(len) : null;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+
+ public IntegerReader getLengths() {
+ return lengths;
+ }
+
+ public InStream getStream() {
+ return stream;
+ }
+ }
+
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe.
+ */
+ public static class StringDictionaryTreeReader extends RawStringTreeReader {
+ private org.apache.orc.impl.DynamicByteArray dictionaryBuffer;
+ private int[] dictionaryOffsets;
+ protected IntegerReader reader;
+
+ private byte[] dictionaryBufferInBytesCache = null;
+ private final LongColumnVector scratchlcv;
+ private final Text result = new Text();
+
+ StringDictionaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null, null);
+ }
+
+ protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ if (data != null && encoding != null) {
+ this.reader = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+
+ if (dictionary != null && encoding != null) {
+ readDictionaryStream(dictionary);
+ }
+
+ if (length != null && encoding != null) {
+ readDictionaryLengthStream(length, encoding);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+
+ // read the dictionary blob
+ org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ InStream in = streams.get(name);
+ readDictionaryStream(in);
+
+ // read the lengths
+ name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
+ in = streams.get(name);
+ readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
+
+ // set up the row reader
+ name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), false, false);
+ }
+
+ private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ int dictionarySize = encoding.getDictionarySize();
+ if (in != null) { // Guard against empty LENGTH stream.
+ IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
+ for (int i = 0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
+ }
+
+ }
+
+ private void readDictionaryStream(InStream in) throws IOException {
+ if (in != null) { // Guard against empty dictionary stream.
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
+ // Since it's the start of a stripe, invalidate the cache.
+ dictionaryBufferInBytesCache = null;
+ }
+ in.close();
+ } else {
+ dictionaryBuffer = null;
+ }
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ byte[] next() throws IOException {
+ super.next();
+ if (valuePresent) {
+ int entry = (int) reader.next();
+ int offset = dictionaryOffsets[entry];
+ int length = getDictionaryEntryLength(entry, offset);
+ // If the column contains only empty strings, the dictionary size
+ // will be zero and the buffer null; in that case just return result,
+ // which defaults to empty
+ if (dictionaryBuffer != null) {
+ dictionaryBuffer.setText(result, offset, length);
+ } else {
+ result.clear();
+ }
+ return result.getBytes();
+ } else {
+ return null;
+ }
+ }
+
+ int getDictionaryEntryLength(int entry, int offset) {
+ final int length;
+ // if it isn't the last entry, subtract the offsets otherwise use
+ // the buffer length.
+ if (entry < dictionaryOffsets.length - 1) {
+ length = dictionaryOffsets[entry + 1] - offset;
+ } else {
+ length = dictionaryBuffer.size() - offset;
+ }
+ return length;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ public IntegerReader getReader() {
+ return reader;
+ }
+ }
+
+ /**
+ * A tree reader that will read char columns. At the start of the
+ * stripe, it creates an internal string reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ public static class CharTreeReader extends DatumTreeReader {
+ protected RawStringTreeReader reader;
+ private final int maxLength;
+
+ CharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, null, null, null, null, null, maxLength);
+ }
+
+ protected CharTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding, int maxLength) throws IOException {
+ super(columnId, present);
+ this.maxLength = maxLength;
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ Datum next() throws IOException {
+ byte[] bytes = reader.next();
+
+ if (bytes == null) {
+ return NullDatum.get();
+ }
+ // TODO: enforce char length
+ return DatumFactory.createChar(bytes);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+// protected static class StructTreeReader extends TreeReader {
+// private final int fileColumnCount;
+// private final int resultColumnCount;
+// protected final TreeReader[] fields;
+// private final String[] fieldNames;
+//
+// protected StructTreeReader(
+// int columnId,
+// TreeReaderSchema treeReaderSchema,
+// boolean[] included,
+// boolean skipCorrupt) throws IOException {
+// super(columnId);
+//
+// OrcProto.Type fileStructType = treeReaderSchema.getFileTypes().get(columnId);
+// fileColumnCount = fileStructType.getFieldNamesCount();
+//
+// OrcProto.Type schemaStructType = treeReaderSchema.getSchemaTypes().get(columnId);
+//
+// if (columnId == treeReaderSchema.getInnerStructSubtype()) {
+// // If there are more result columns than reader columns, we will default those additional
+// // columns to NULL.
+// resultColumnCount = schemaStructType.getFieldNamesCount();
+// } else {
+// resultColumnCount = fileColumnCount;
+// }
+//
+// this.fields = new TreeReader[fileColumnCount];
+// this.fieldNames = new String[fileColumnCount];
+//
+// if (included == null) {
+// for (int i = 0; i < fileColumnCount; ++i) {
+// int subtype = schemaStructType.getSubtypes(i);
+// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt);
+// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name.
+// this.fieldNames[i] = schemaStructType.getFieldNames(i);
+// }
+// } else {
+// for (int i = 0; i < fileColumnCount; ++i) {
+// int subtype = schemaStructType.getSubtypes(i);
+// if (subtype >= included.length) {
+// throw new IOException("subtype " + subtype + " exceeds the included array size " +
+// included.length + " fileTypes " + treeReaderSchema.getFileTypes().toString() +
+// " schemaTypes " + treeReaderSchema.getSchemaTypes().toString() +
+// " innerStructSubtype " + treeReaderSchema.getInnerStructSubtype());
+// }
+// if (included[subtype]) {
+// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt);
+// }
+// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name.
+// this.fieldNames[i] = schemaStructType.getFieldNames(i);
+// }
+// }
+// }
+//
+// @Override
+// void seek(PositionProvider[] index) throws IOException {
+// super.seek(index);
+// for (TreeReader kid : fields) {
+// if (kid != null) {
+// kid.seek(index);
+// }
+// }
+// }
+//
+// @Override
+// Object next(Object previous) throws IOException {
+// super.next(previous);
+// OrcStruct result = null;
+// if (valuePresent) {
+// if (previous == null) {
+// result = new OrcStruct(resultColumnCount);
+// } else {
+// result = (OrcStruct) previous;
+//
+// // If the input format was initialized with a file with a
+// // different number of fields, the number of fields needs to
+// // be updated to the correct number
+// if (result.getNumFields() != resultColumnCount) {
+// result.setNumFields(resultColumnCount);
+// }
+// }
+// for (int i = 0; i < fileColumnCount; ++i) {
+// if (fields[i] != null) {
+// result.setFieldValue(i, fields[i].next(result.getFieldValue(i)));
+// }
+// }
+// if (resultColumnCount > fileColumnCount) {
+// for (int i = fileColumnCount; i < resultColumnCount; ++i) {
+// // Default new treeReaderSchema evolution fields to NULL.
+// result.setFieldValue(i, null);
+// }
+// }
+// }
+// return result;
+// }
+//
+// @Override
+// void startStripe(Map streams,
+// OrcProto.StripeFooter stripeFooter
+// ) throws IOException {
+// super.startStripe(streams, stripeFooter);
+// for (TreeReader field : fields) {
+// if (field != null) {
+// field.startStripe(streams, stripeFooter);
+// }
+// }
+// }
+//
+// @Override
+// void skipRows(long items) throws IOException {
+// items = countNonNulls(items);
+// for (TreeReader field : fields) {
+// if (field != null) {
+// field.skipRows(items);
+// }
+// }
+// }
+// }
+
+ public static DatumTreeReader createTreeReader(TableMeta meta,
+ int columnId,
+ Column column,
+ boolean skipCorrupt
+ ) throws IOException {
+ TypeDesc typeDesc = column.getTypeDesc();
+ int orcColumnId = columnId + 1; // root record column is considered
+ switch (typeDesc.getDataType().getType()) {
+ case BOOLEAN:
+ return new BooleanTreeReader(orcColumnId);
+ case BIT:
+ return new ByteTreeReader(orcColumnId);
+ case FLOAT8:
+ return new DoubleTreeReader(orcColumnId);
+ case FLOAT4:
+ return new FloatTreeReader(orcColumnId);
+ case INT2:
+ return new ShortTreeReader(orcColumnId);
+ case INT4:
+ return new IntTreeReader(orcColumnId);
+ case INT8:
+ return new LongTreeReader(orcColumnId, skipCorrupt);
+ case TEXT:
+ return new StringTreeReader(orcColumnId);
+ case CHAR:
+ return new CharTreeReader(orcColumnId, typeDesc.getDataType().getLength());
+ case BLOB:
+ return new BinaryTreeReader(orcColumnId);
+ case TIMESTAMP:
+ return new TimestampTreeReader(meta, orcColumnId, skipCorrupt);
+ case DATE:
+ return new DateTreeReader(orcColumnId);
+ case INET4:
+ return new InetTreeReader(orcColumnId);
+// case STRUCT:
+// return new StructTreeReader(columnId, treeReaderSchema, included, skipCorrupt);
+ default:
+ throw new TajoRuntimeException(new UnsupportedException("Unsupported type " +
+ typeDesc.getDataType().getType().name()));
+ }
+ }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
index 833d102744..4cf008a3a9 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
@@ -45,6 +45,7 @@
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.Text;
+import org.apache.tajo.unit.TimeUnit;
import org.apache.tajo.util.datetime.DateTimeUtil;
import java.io.IOException;
@@ -1467,7 +1468,6 @@ void recordPosition(PositionRecorder recorder) throws IOException {
}
}
- static final int MILLIS_PER_SECOND = 1000;
static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";
private static class TimestampTreeWriter extends TreeWriter {
@@ -1489,7 +1489,7 @@ private static class TimestampTreeWriter extends TreeWriter {
OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
recordPosition(rowIndexPosition);
// for unit tests to set different time zones
- this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
+ this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / TimeUnit.MILLIS_PER_SECOND;
writer.useWriterTimeZone(true);
timeZone = writer.getTimeZone();
}
@@ -1515,7 +1515,7 @@ void write(Datum datum) throws IOException {
Timestamp val = new Timestamp(javaTimestamp);
indexStatistics.updateTimestamp(val);
- seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp);
+ seconds.write((val.getTime() / TimeUnit.MILLIS_PER_SECOND) - base_timestamp);
nanos.write(formatNanos(val.getNanos()));
if (createBloomFilter) {
bloomFilter.addLong(val.getTime());
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
index b63b497d5b..cc3f46399b 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
@@ -38,6 +38,7 @@
import org.apache.tajo.storage.fragment.FileFragment;
import org.apache.tajo.storage.sequencefile.SequenceFileScanner;
import org.apache.tajo.storage.text.DelimitedTextFile;
+import org.apache.tajo.storage.thirdparty.orc.OrcFile.OrcTableProperties;
import org.apache.tajo.util.CommonTestingUtil;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -61,6 +62,7 @@ public class TestCompressionStorages {
public TestCompressionStorages(String type) throws IOException {
this.dataFormat = type;
conf = new TajoConf();
+ conf.setBoolean("hive.exec.orc.zerocopy", true);
testDir = CommonTestingUtil.getTestDir(TEST_PATH);
fs = testDir.getFileSystem(conf);
@@ -71,7 +73,8 @@ public static Collection
- * The description and format for these types are as below:
- *
- * SHORT_REPEAT: Used for short repeated integer sequences.
- *
- *
1 byte header
- *
- *
2 bits for encoding type
- *
3 bits for bytes required for repeating value
- *
3 bits for repeat count (MIN_REPEAT + run length)
- *
- *
- *
Blob - repeat value (fixed bytes)
- *
- *
- *
- * DIRECT: Used for random integer sequences whose number of bit
- * requirement doesn't vary a lot.
- *
- *
2 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- *
Blob - stores the direct values using fixed bit width. The length of the
- * data blob is (fixed width * run length) bits long
- *
- *
- *
- * PATCHED_BASE: Used for random integer sequences whose number of bit
- * requirement varies beyond a threshold.
- *
- *
4 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- * 3rd byte
- *
3 bits for bytes required to encode base value
- *
5 bits for patch width
- *
- *
- * 4th byte
- *
3 bits for patch gap width
- *
5 bits for patch length
- *
- *
- *
Base value - Stored using fixed number of bytes. If MSB is set, base
- * value is negative else positive. Length of base value is (base width * 8)
- * bits.
- *
Data blob - Base reduced values as stored using fixed bit width. Length
- * of data blob is (fixed width * run length) bits.
- *
Patch blob - Patch blob is a list of gap and patch value. Each entry in
- * the patch list is (patch width + patch gap width) bits long. Gap between the
- * subsequent elements to be patched are stored in upper part of entry whereas
- * patch values are stored in lower part of entry. Length of patch blob is
- * ((patch width + patch gap width) * patch length) bits.
- *
- *
- *
- * DELTA Used for monotonically increasing or decreasing sequences,
- * sequences with fixed delta values or long repeated sequences.
- *
- *
2 bytes header
- *
- * 1st byte
- *
2 bits for encoding type
- *
5 bits for fixed bit width of values in blob
- *
1 bit for storing MSB of run length
- *
- *
- * 2nd byte
- *
8 bits for lower run length bits
- *
- *
- *
Base value - encoded as varint
- *
Delta base - encoded as varint
- *
Delta blob - only positive values. monotonicity and orderness are decided
- * based on the sign of the base value and delta base
- *
- *
- */
-class RunLengthIntegerWriterV2 implements IntegerWriter {
-
- public enum EncodingType {
- SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA
- }
-
- static final int MAX_SCOPE = 512;
- static final int MIN_REPEAT = 3;
- private static final int MAX_SHORT_REPEAT_LENGTH = 10;
- private long prevDelta = 0;
- private int fixedRunLength = 0;
- private int variableRunLength = 0;
- private final long[] literals = new long[MAX_SCOPE];
- private final PositionedOutputStream output;
- private final boolean signed;
- private EncodingType encoding;
- private int numLiterals;
- private final long[] zigzagLiterals = new long[MAX_SCOPE];
- private final long[] baseRedLiterals = new long[MAX_SCOPE];
- private final long[] adjDeltas = new long[MAX_SCOPE];
- private long fixedDelta;
- private int zzBits90p;
- private int zzBits100p;
- private int brBits95p;
- private int brBits100p;
- private int bitsDeltaMax;
- private int patchWidth;
- private int patchGapWidth;
- private int patchLength;
- private long[] gapVsPatchList;
- private long min;
- private boolean isFixedDelta;
- private SerializationUtils utils;
- private boolean alignedBitpacking;
-
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
- this(output, signed, true);
- }
-
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
- boolean alignedBitpacking) {
- this.output = output;
- this.signed = signed;
- this.alignedBitpacking = alignedBitpacking;
- this.utils = new SerializationUtils();
- clear();
- }
-
- private void writeValues() throws IOException {
- if (numLiterals != 0) {
-
- if (encoding.equals(EncodingType.SHORT_REPEAT)) {
- writeShortRepeatValues();
- } else if (encoding.equals(EncodingType.DIRECT)) {
- writeDirectValues();
- } else if (encoding.equals(EncodingType.PATCHED_BASE)) {
- writePatchedBaseValues();
- } else {
- writeDeltaValues();
- }
-
- // clear all the variables
- clear();
- }
- }
-
- private void writeDeltaValues() throws IOException {
- int len = 0;
- int fb = bitsDeltaMax;
- int efb = 0;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- if (isFixedDelta) {
- // if fixed run length is greater than threshold then it will be fixed
- // delta sequence with delta value 0 else fixed delta sequence with
- // non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
- // ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
- } else {
- // ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
- } else {
- // fixed width 0 is used for long repeating values.
- // sequences that require only 1 bit to encode will have an additional bit
- if (fb == 1) {
- fb = 2;
- }
- efb = utils.encodeBitWidth(fb);
- efb = efb << 1;
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
-
- // extract the 9th bit of run length
- final int tailBits = (len & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = len & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // store the first value from zigzag literal array
- if (signed) {
- utils.writeVslong(output, literals[0]);
- } else {
- utils.writeVulong(output, literals[0]);
- }
-
- if (isFixedDelta) {
- // if delta is fixed then we don't need to store delta blob
- utils.writeVslong(output, fixedDelta);
- } else {
- // store the first value as delta value using zigzag encoding
- utils.writeVslong(output, adjDeltas[0]);
-
- // adjacent delta values are bit packed. The length of adjDeltas array is
- // always one less than the number of literals (delta difference for n
- // elements is n-1). We have already written one element, write the
- // remaining numLiterals - 2 elements here
- utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output);
- }
- }
-
- private void writePatchedBaseValues() throws IOException {
-
- // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
- // because patch is applied to MSB bits. For example: If fixed bit width of
- // base value is 7 bits and if patch is 3 bits, the actual value is
- // constructed by shifting the patch to left by 7 positions.
- // actual_value = patch << 7 | base_value
- // So, if we align base_value then actual_value can not be reconstructed.
-
- // write the number of fixed bits required in next 5 bits
- final int fb = brBits95p;
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length, they are one off
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // if the min value is negative toggle the sign
- final boolean isNegative = min < 0 ? true : false;
- if (isNegative) {
- min = -min;
- }
-
- // find the number of bytes required for base and shift it by 5 bits
- // to accommodate patch width. The additional bit is used to store the sign
- // of the base value.
- final int baseWidth = utils.findClosestNumBits(min) + 1;
- final int baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
- final int bb = (baseBytes - 1) << 5;
-
- // if the base value is negative then set MSB to 1
- if (isNegative) {
- min |= (1L << ((baseBytes * 8) - 1));
- }
-
- // third byte contains 3 bits for number of bytes occupied by base
- // and 5 bits for patchWidth
- final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth);
-
- // fourth byte contains 3 bits for page gap width and 5 bits for
- // patch length
- final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
- output.write(headerThirdByte);
- output.write(headerFourthByte);
-
- // write the base value using fixed bytes in big endian order
- for(int i = baseBytes - 1; i >= 0; i--) {
- byte b = (byte) ((min >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- // base reduced literals are bit packed
- int closestFixedBits = utils.getClosestFixedBits(fb);
-
- utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits,
- output);
-
- // write patch list
- closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth);
-
- utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits,
- output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- /**
- * Store the opcode in 2 MSB bits
- * @return opcode
- */
- private int getOpcode() {
- return encoding.ordinal() << 6;
- }
-
- private void writeDirectValues() throws IOException {
-
- // write the number of fixed bits required in next 5 bits
- int fb = zzBits100p;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // bit packing the zigzag encoded literals
- utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- private void writeShortRepeatValues() throws IOException {
- // get the value that is repeating, compute the bits and bytes required
- long repeatVal = 0;
- if (signed) {
- repeatVal = utils.zigzagEncode(literals[0]);
- } else {
- repeatVal = literals[0];
- }
-
- final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal);
- final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3
- : (numBitsRepeatVal >>> 3) + 1;
-
- // write encoding type in top 2 bits
- int header = getOpcode();
-
- // write the number of bytes required for the value
- header |= ((numBytesRepeatVal - 1) << 3);
-
- // write the run length
- fixedRunLength -= MIN_REPEAT;
- header |= fixedRunLength;
-
- // write the header
- output.write(header);
-
- // write the repeating value in big endian byte order
- for(int i = numBytesRepeatVal - 1; i >= 0; i--) {
- int b = (int) ((repeatVal >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- fixedRunLength = 0;
- }
-
- private void determineEncoding() {
-
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
- computeZigZagLiterals();
-
- zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
-
- // not a big win for shorter runs to determine encoding
- if (numLiterals <= MIN_REPEAT) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // DELTA encoding check
-
- // for identifying monotonic sequences
- boolean isIncreasing = true;
- boolean isDecreasing = true;
- this.isFixedDelta = true;
-
- this.min = literals[0];
- long max = literals[0];
- final long initialDelta = literals[1] - literals[0];
- long currDelta = initialDelta;
- long deltaMax = initialDelta;
- this.adjDeltas[0] = initialDelta;
-
- for (int i = 1; i < numLiterals; i++) {
- final long l1 = literals[i];
- final long l0 = literals[i - 1];
- currDelta = l1 - l0;
- min = Math.min(min, l1);
- max = Math.max(max, l1);
-
- isIncreasing &= (l0 <= l1);
- isDecreasing &= (l0 >= l1);
-
- isFixedDelta &= (currDelta == initialDelta);
- if (i > 1) {
- adjDeltas[i - 1] = Math.abs(currDelta);
- deltaMax = Math.max(deltaMax, adjDeltas[i - 1]);
- }
- }
-
- // its faster to exit under delta overflow condition without checking for
- // PATCHED_BASE condition as encoding using DIRECT is faster and has less
- // overhead than PATCHED_BASE
- if (!utils.isSafeSubtract(max, min)) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // invariant - subtracting any number from any other in the literals after
- // this point won't overflow
-
- // if initialDelta is 0 then we cannot delta encode as we cannot identify
- // the sign of deltas (increasing or decreasing)
- if (initialDelta != 0) {
-
- // if min is equal to max then the delta is 0, this condition happens for
- // fixed values run >10 which cannot be encoded with SHORT_REPEAT
- if (min == max) {
- assert isFixedDelta : min + "==" + max +
- ", isFixedDelta cannot be false";
- assert currDelta == 0 : min + "==" + max + ", currDelta should be zero";
- fixedDelta = 0;
- encoding = EncodingType.DELTA;
- return;
- }
-
- if (isFixedDelta) {
- assert currDelta == initialDelta
- : "currDelta should be equal to initialDelta for fixed delta encoding";
- encoding = EncodingType.DELTA;
- fixedDelta = currDelta;
- return;
- }
-
- // stores the number of bits required for packing delta blob in
- // delta encoding
- bitsDeltaMax = utils.findClosestNumBits(deltaMax);
-
- // monotonic condition
- if (isIncreasing || isDecreasing) {
- encoding = EncodingType.DELTA;
- return;
- }
- }
-
- // PATCHED_BASE encoding check
-
- // percentile values are computed for the zigzag encoded values. if the
- // number of bit requirement between 90th and 100th percentile varies
- // beyond a threshold then we need to patch the values. if the variation
- // is not significant then we can use direct encoding
-
- zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
- int diffBitsLH = zzBits100p - zzBits90p;
-
- // if the difference between 90th percentile and 100th percentile fixed
- // bits is > 1 then we need patch the values
- if (diffBitsLH > 1) {
-
- // patching is done only on base reduced values.
- // remove base from literals
- for (int i = 0; i < numLiterals; i++) {
- baseRedLiterals[i] = literals[i] - min;
- }
-
- // 95th percentile width is used to determine max allowed value
- // after which patching will be done
- brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
-
- // 100th percentile is used to compute the max patch width
- brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);
-
- // after base reducing the values, if the difference in bits between
- // 95th percentile and 100th percentile value is zero then there
- // is no point in patching the values, in which case we will
- // fallback to DIRECT encoding.
- // The decision to use patched base was based on zigzag values, but the
- // actual patching is done on base reduced literals.
- if ((brBits100p - brBits95p) != 0) {
- encoding = EncodingType.PATCHED_BASE;
- preparePatchedBlob();
- return;
- } else {
- encoding = EncodingType.DIRECT;
- return;
- }
- } else {
- // if difference in bits between 95th percentile and 100th percentile is
- // 0, then patch length will become 0. Hence we will fallback to direct
- encoding = EncodingType.DIRECT;
- return;
- }
- }
-
- private void computeZigZagLiterals() {
- // populate zigzag encoded literals
- long zzEncVal = 0;
- for (int i = 0; i < numLiterals; i++) {
- if (signed) {
- zzEncVal = utils.zigzagEncode(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[i] = zzEncVal;
- }
- }
-
- private void preparePatchedBlob() {
- // mask will be max value beyond which patch will be generated
- long mask = (1L << brBits95p) - 1;
-
- // since we are considering only 95 percentile, the size of gap and
- // patch array can contain only be 5% values
- patchLength = (int) Math.ceil((numLiterals * 0.05));
-
- int[] gapList = new int[patchLength];
- long[] patchList = new long[patchLength];
-
- // #bit for patch
- patchWidth = brBits100p - brBits95p;
- patchWidth = utils.getClosestFixedBits(patchWidth);
-
- // if patch bit requirement is 64 then it will not possible to pack
- // gap and patch together in a long. To make sure gap and patch can be
- // packed together adjust the patch width
- if (patchWidth == 64) {
- patchWidth = 56;
- brBits95p = 8;
- mask = (1L << brBits95p) - 1;
- }
-
- int gapIdx = 0;
- int patchIdx = 0;
- int prev = 0;
- int gap = 0;
- int maxGap = 0;
-
- for(int i = 0; i < numLiterals; i++) {
- // if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
- gap = i - prev;
- if (gap > maxGap) {
- maxGap = gap;
- }
-
- // gaps are relative, so store the previous patched value index
- prev = i;
- gapList[gapIdx++] = gap;
-
- // extract the most significant bits that are over mask bits
- long patch = baseRedLiterals[i] >>> brBits95p;
- patchList[patchIdx++] = patch;
-
- // strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
- }
- }
-
- // adjust the patch length to number of entries in gap list
- patchLength = gapIdx;
-
- // if the element to be patched is the first and only element then
- // max gap will be 0, but to store the gap as 0 we need atleast 1 bit
- if (maxGap == 0 && patchLength != 0) {
- patchGapWidth = 1;
- } else {
- patchGapWidth = utils.findClosestNumBits(maxGap);
- }
-
- // special case: if the patch gap width is greater than 256, then
- // we need 9 bits to encode the gap width. But we only have 3 bits in
- // header to record the gap width. To deal with this case, we will save
- // two entries in patch list in the following way
- // 256 gap width => 0 for patch value
- // actual gap - 256 => actual patch value
- // We will do the same for gap width = 511. If the element to be patched is
- // the last element in the scope then gap width will be 511. In this case we
- // will have 3 entries in the patch list in the following way
- // 255 gap width => 0 for patch value
- // 255 gap width => 0 for patch value
- // 1 gap width => actual patch value
- if (patchGapWidth > 8) {
- patchGapWidth = 8;
- // for gap = 511, we need two additional entries in patch list
- if (maxGap == 511) {
- patchLength += 2;
- } else {
- patchLength += 1;
- }
- }
-
- // create gap vs patch list
- gapIdx = 0;
- patchIdx = 0;
- gapVsPatchList = new long[patchLength];
- for(int i = 0; i < patchLength; i++) {
- long g = gapList[gapIdx++];
- long p = patchList[patchIdx++];
- while (g > 255) {
- gapVsPatchList[i++] = (255L << patchWidth);
- g -= 255;
- }
-
- // store patch value in LSBs and gap in MSBs
- gapVsPatchList[i] = (g << patchWidth) | p;
- }
- }
-
- /**
- * clears all the variables
- */
- private void clear() {
- numLiterals = 0;
- encoding = null;
- prevDelta = 0;
- fixedDelta = 0;
- zzBits90p = 0;
- zzBits100p = 0;
- brBits95p = 0;
- brBits100p = 0;
- bitsDeltaMax = 0;
- patchGapWidth = 0;
- patchLength = 0;
- patchWidth = 0;
- gapVsPatchList = null;
- min = 0;
- isFixedDelta = true;
- }
-
- @Override
- public void flush() throws IOException {
- if (numLiterals != 0) {
- if (variableRunLength != 0) {
- determineEncoding();
- writeValues();
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding();
- writeValues();
- } else if (fixedRunLength >= MIN_REPEAT
- && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
- }
- output.flush();
- }
-
- @Override
- public void write(long val) throws IOException {
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- if (numLiterals == 1) {
- prevDelta = val - literals[0];
- literals[numLiterals++] = val;
- // if both values are same count as fixed run else variable run
- if (val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
- } else {
- fixedRunLength = 0;
- variableRunLength = 2;
- }
- } else {
- long currentDelta = val - literals[numLiterals - 1];
- if (prevDelta == 0 && currentDelta == 0) {
- // fixed delta run
-
- literals[numLiterals++] = val;
-
- // if variable run is non-zero then we are seeing repeating
- // values at the end of variable run in which case keep
- // updating variable and fixed runs
- if (variableRunLength > 0) {
- fixedRunLength = 2;
- }
- fixedRunLength += 1;
-
- // if fixed run met the minimum condition and if variable
- // run is non-zero then flush the variable run and shift the
- // tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
- numLiterals -= MIN_REPEAT;
- variableRunLength -= MIN_REPEAT - 1;
- // copy the tail fixed runs
- long[] tailVals = new long[MIN_REPEAT];
- System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT);
-
- // determine variable encoding and flush values
- determineEncoding();
- writeValues();
-
- // shift tail fixed runs to beginning of the buffer
- for(long l : tailVals) {
- literals[numLiterals++] = l;
- }
- }
-
- // if fixed runs reached max repeat length then write values
- if (fixedRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- } else {
- // variable delta run
-
- // if fixed run length is non-zero and if it satisfies the
- // short repeat conditions then write the values as short repeats
- // else use delta encoding
- if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
-
- // if fixed run length is 0 && fixedRunLength < MIN_REPEAT) {
- if (val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- }
- }
-
- // after writing values re-initialize the variables
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- // keep updating variable run lengths
- prevDelta = val - literals[numLiterals - 1];
- literals[numLiterals++] = val;
- variableRunLength += 1;
-
- // if variable run length reach the max scope, write it
- if (variableRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- }
- }
- }
- }
- }
-
- private void initializeLiterals(long val) {
- literals[numLiterals++] = val;
- fixedRunLength = 1;
- variableRunLength = 1;
- }
-
- @Override
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(numLiterals);
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java
deleted file mode 100644
index 53687b7fdb..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java
+++ /dev/null
@@ -1,844 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.math.BigInteger;
-
-final class SerializationUtils {
-
- private final static int BUFFER_SIZE = 64;
- private final byte[] readBuffer;
- private final byte[] writeBuffer;
-
- public SerializationUtils() {
- this.readBuffer = new byte[BUFFER_SIZE];
- this.writeBuffer = new byte[BUFFER_SIZE];
- }
-
- void writeVulong(OutputStream output, long value) throws IOException {
- while (true) {
- if ((value & ~0x7f) == 0) {
- output.write((byte) value);
- return;
- } else {
- output.write((byte) (0x80 | (value & 0x7f)));
- value >>>= 7;
- }
- }
- }
-
- void writeVslong(OutputStream output, long value) throws IOException {
- writeVulong(output, (value << 1) ^ (value >> 63));
- }
-
-
- long readVulong(InputStream in) throws IOException {
- long result = 0;
- long b;
- int offset = 0;
- do {
- b = in.read();
- if (b == -1) {
- throw new EOFException("Reading Vulong past EOF");
- }
- result |= (0x7f & b) << offset;
- offset += 7;
- } while (b >= 0x80);
- return result;
- }
-
- long readVslong(InputStream in) throws IOException {
- long result = readVulong(in);
- return (result >>> 1) ^ -(result & 1);
- }
-
- float readFloat(InputStream in) throws IOException {
- int ser = in.read() | (in.read() << 8) | (in.read() << 16) |
- (in.read() << 24);
- return Float.intBitsToFloat(ser);
- }
-
- void writeFloat(OutputStream output, float value) throws IOException {
- int ser = Float.floatToIntBits(value);
- output.write(ser & 0xff);
- output.write((ser >> 8) & 0xff);
- output.write((ser >> 16) & 0xff);
- output.write((ser >> 24) & 0xff);
- }
-
- double readDouble(InputStream in) throws IOException {
- return Double.longBitsToDouble(readLongLE(in));
- }
-
- long readLongLE(InputStream in) throws IOException {
- in.read(readBuffer, 0, 8);
- return (((readBuffer[0] & 0xff) << 0)
- + ((readBuffer[1] & 0xff) << 8)
- + ((readBuffer[2] & 0xff) << 16)
- + ((long) (readBuffer[3] & 0xff) << 24)
- + ((long) (readBuffer[4] & 0xff) << 32)
- + ((long) (readBuffer[5] & 0xff) << 40)
- + ((long) (readBuffer[6] & 0xff) << 48)
- + ((long) (readBuffer[7] & 0xff) << 56));
- }
-
- void writeDouble(OutputStream output, double value) throws IOException {
- writeLongLE(output, Double.doubleToLongBits(value));
- }
-
- private void writeLongLE(OutputStream output, long value) throws IOException {
- writeBuffer[0] = (byte) ((value >> 0) & 0xff);
- writeBuffer[1] = (byte) ((value >> 8) & 0xff);
- writeBuffer[2] = (byte) ((value >> 16) & 0xff);
- writeBuffer[3] = (byte) ((value >> 24) & 0xff);
- writeBuffer[4] = (byte) ((value >> 32) & 0xff);
- writeBuffer[5] = (byte) ((value >> 40) & 0xff);
- writeBuffer[6] = (byte) ((value >> 48) & 0xff);
- writeBuffer[7] = (byte) ((value >> 56) & 0xff);
- output.write(writeBuffer, 0, 8);
- }
-
- /**
- * Write the arbitrarily sized signed BigInteger in vint format.
- *
- * Signed integers are encoded using the low bit as the sign bit using zigzag
- * encoding.
- *
- * Each byte uses the low 7 bits for data and the high bit for stop/continue.
- *
- * Bytes are stored LSB first.
- * @param output the stream to write to
- * @param value the value to output
- * @throws IOException
- */
- static void writeBigInteger(OutputStream output,
- BigInteger value) throws IOException {
- // encode the signed number as a positive integer
- value = value.shiftLeft(1);
- int sign = value.signum();
- if (sign < 0) {
- value = value.negate();
- value = value.subtract(BigInteger.ONE);
- }
- int length = value.bitLength();
- while (true) {
- long lowBits = value.longValue() & 0x7fffffffffffffffL;
- length -= 63;
- // write out the next 63 bits worth of data
- for(int i=0; i < 9; ++i) {
- // if this is the last byte, leave the high bit off
- if (length <= 0 && (lowBits & ~0x7f) == 0) {
- output.write((byte) lowBits);
- return;
- } else {
- output.write((byte) (0x80 | (lowBits & 0x7f)));
- lowBits >>>= 7;
- }
- }
- value = value.shiftRight(63);
- }
- }
-
- /**
- * Read the signed arbitrary sized BigInteger BigInteger in vint format
- * @param input the stream to read from
- * @return the read BigInteger
- * @throws IOException
- */
- static BigInteger readBigInteger(InputStream input) throws IOException {
- BigInteger result = BigInteger.ZERO;
- long work = 0;
- int offset = 0;
- long b;
- do {
- b = input.read();
- if (b == -1) {
- throw new EOFException("Reading BigInteger past EOF from " + input);
- }
- work |= (0x7f & b) << (offset % 63);
- offset += 7;
- // if we've read 63 bits, roll them into the result
- if (offset == 63) {
- result = BigInteger.valueOf(work);
- work = 0;
- } else if (offset % 63 == 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63));
- work = 0;
- }
- } while (b >= 0x80);
- if (work != 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63));
- }
- // convert back to a signed number
- boolean isNegative = result.testBit(0);
- if (isNegative) {
- result = result.add(BigInteger.ONE);
- result = result.negate();
- }
- result = result.shiftRight(1);
- return result;
- }
-
- enum FixedBitSizes {
- ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
- THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
- TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
- TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR;
- }
-
- /**
- * Count the number of bits required to encode the given value
- * @param value
- * @return bits required to store value
- */
- int findClosestNumBits(long value) {
- int count = 0;
- while (value != 0) {
- count++;
- value = value >>> 1;
- }
- return getClosestFixedBits(count);
- }
-
- /**
- * zigzag encode the given value
- * @param val
- * @return zigzag encoded value
- */
- long zigzagEncode(long val) {
- return (val << 1) ^ (val >> 63);
- }
-
- /**
- * zigzag decode the given value
- * @param val
- * @return zizag decoded value
- */
- long zigzagDecode(long val) {
- return (val >>> 1) ^ -(val & 1);
- }
-
- /**
- * Compute the bits required to represent pth percentile value
- * @param data - array
- * @param p - percentile value (>=0.0 to <=1.0)
- * @return pth percentile bits
- */
- int percentileBits(long[] data, int offset, int length, double p) {
- if ((p > 1.0) || (p <= 0.0)) {
- return -1;
- }
-
- // histogram that store the encoded bit requirement for each values.
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
- int[] hist = new int[32];
-
- // compute the histogram
- for(int i = offset; i < (offset + length); i++) {
- int idx = encodeBitWidth(findClosestNumBits(data[i]));
- hist[idx] += 1;
- }
-
- int perLen = (int) (length * (1.0 - p));
-
- // return the bits required by pth percentile length
- for(int i = hist.length - 1; i >= 0; i--) {
- perLen -= hist[i];
- if (perLen < 0) {
- return decodeBitWidth(i);
- }
- }
-
- return 0;
- }
-
- /**
- * Calculate the number of bytes required
- * @param n - number of values
- * @param numBits - bit width
- * @return number of bytes required
- */
- int getTotalBytesRequired(int n, int numBits) {
- return (n * numBits + 7) / 8;
- }
-
- /**
- * For a given fixed bit this function will return the closest available fixed
- * bit
- * @param n
- * @return closest valid fixed bit
- */
- int getClosestFixedBits(int n) {
- if (n == 0) {
- return 1;
- }
-
- if (n >= 1 && n <= 24) {
- return n;
- } else if (n > 24 && n <= 26) {
- return 26;
- } else if (n > 26 && n <= 28) {
- return 28;
- } else if (n > 28 && n <= 30) {
- return 30;
- } else if (n > 30 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- public int getClosestAlignedFixedBits(int n) {
- if (n == 0 || n == 1) {
- return 1;
- } else if (n > 1 && n <= 2) {
- return 2;
- } else if (n > 2 && n <= 4) {
- return 4;
- } else if (n > 4 && n <= 8) {
- return 8;
- } else if (n > 8 && n <= 16) {
- return 16;
- } else if (n > 16 && n <= 24) {
- return 24;
- } else if (n > 24 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Finds the closest available fixed bit width match and returns its encoded
- * value (ordinal)
- * @param n - fixed bit width to encode
- * @return encoded fixed bit width
- */
- int encodeBitWidth(int n) {
- n = getClosestFixedBits(n);
-
- if (n >= 1 && n <= 24) {
- return n - 1;
- } else if (n > 24 && n <= 26) {
- return FixedBitSizes.TWENTYSIX.ordinal();
- } else if (n > 26 && n <= 28) {
- return FixedBitSizes.TWENTYEIGHT.ordinal();
- } else if (n > 28 && n <= 30) {
- return FixedBitSizes.THIRTY.ordinal();
- } else if (n > 30 && n <= 32) {
- return FixedBitSizes.THIRTYTWO.ordinal();
- } else if (n > 32 && n <= 40) {
- return FixedBitSizes.FORTY.ordinal();
- } else if (n > 40 && n <= 48) {
- return FixedBitSizes.FORTYEIGHT.ordinal();
- } else if (n > 48 && n <= 56) {
- return FixedBitSizes.FIFTYSIX.ordinal();
- } else {
- return FixedBitSizes.SIXTYFOUR.ordinal();
- }
- }
-
- /**
- * Decodes the ordinal fixed bit value to actual fixed bit width value
- * @param n - encoded fixed bit width
- * @return decoded fixed bit width
- */
- int decodeBitWidth(int n) {
- if (n >= FixedBitSizes.ONE.ordinal()
- && n <= FixedBitSizes.TWENTYFOUR.ordinal()) {
- return n + 1;
- } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) {
- return 26;
- } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) {
- return 28;
- } else if (n == FixedBitSizes.THIRTY.ordinal()) {
- return 30;
- } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) {
- return 32;
- } else if (n == FixedBitSizes.FORTY.ordinal()) {
- return 40;
- } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) {
- return 48;
- } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Bitpack and write the input values to underlying output stream
- * @param input - values to write
- * @param offset - offset
- * @param len - length
- * @param bitSize - bit width
- * @param output - output stream
- * @throws IOException
- */
- void writeInts(long[] input, int offset, int len, int bitSize,
- OutputStream output) throws IOException {
- if (input == null || input.length < 1 || offset < 0 || len < 1
- || bitSize < 1) {
- return;
- }
-
- switch (bitSize) {
- case 1:
- unrolledBitPack1(input, offset, len, output);
- return;
- case 2:
- unrolledBitPack2(input, offset, len, output);
- return;
- case 4:
- unrolledBitPack4(input, offset, len, output);
- return;
- case 8:
- unrolledBitPack8(input, offset, len, output);
- return;
- case 16:
- unrolledBitPack16(input, offset, len, output);
- return;
- case 24:
- unrolledBitPack24(input, offset, len, output);
- return;
- case 32:
- unrolledBitPack32(input, offset, len, output);
- return;
- case 40:
- unrolledBitPack40(input, offset, len, output);
- return;
- case 48:
- unrolledBitPack48(input, offset, len, output);
- return;
- case 56:
- unrolledBitPack56(input, offset, len, output);
- return;
- case 64:
- unrolledBitPack64(input, offset, len, output);
- return;
- default:
- break;
- }
-
- int bitsLeft = 8;
- byte current = 0;
- for(int i = offset; i < (offset + len); i++) {
- long value = input[i];
- int bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= value >>> (bitsToWrite - bitsLeft);
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (1L << bitsToWrite) - 1;
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- bitsLeft -= bitsToWrite;
- current |= value << bitsLeft;
- if (bitsLeft == 0) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- // flush
- if (bitsLeft != 8) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- private void unrolledBitPack1(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 1) << 7)
- | ((input[i + 1] & 1) << 6)
- | ((input[i + 2] & 1) << 5)
- | ((input[i + 3] & 1) << 4)
- | ((input[i + 4] & 1) << 3)
- | ((input[i + 5] & 1) << 2)
- | ((input[i + 6] & 1) << 1)
- | (input[i + 7]) & 1);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 7;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 1) << startShift);
- startShift -= 1;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack2(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 4;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 3) << 6)
- | ((input[i + 1] & 3) << 4)
- | ((input[i + 2] & 3) << 2)
- | (input[i + 3]) & 3);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 6;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 3) << startShift);
- startShift -= 2;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack4(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 2;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 4;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 15) << startShift);
- startShift -= 4;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack8(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 1);
- }
-
- private void unrolledBitPack16(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 2);
- }
-
- private void unrolledBitPack24(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 3);
- }
-
- private void unrolledBitPack32(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 4);
- }
-
- private void unrolledBitPack40(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 5);
- }
-
- private void unrolledBitPack48(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 6);
- }
-
- private void unrolledBitPack56(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 7);
- }
-
- private void unrolledBitPack64(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 8);
- }
-
- private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int i = offset;
- for (; i < endUnroll; i = i + numHops) {
- writeLongBE(output, input, i, numHops, numBytes);
- }
-
- if (remainder > 0) {
- writeRemainingLongs(output, i, input, remainder, numBytes);
- }
- }
-
- private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder,
- int numBytes) throws IOException {
- final int numHops = remainder;
-
- int idx = 0;
- switch (numBytes) {
- case 1:
- while (remainder > 0) {
- writeBuffer[idx] = (byte) (input[offset + idx] & 255);
- remainder--;
- idx++;
- }
- break;
- case 2:
- while (remainder > 0) {
- writeLongBE2(output, input[offset + idx], idx * 2);
- remainder--;
- idx++;
- }
- break;
- case 3:
- while (remainder > 0) {
- writeLongBE3(output, input[offset + idx], idx * 3);
- remainder--;
- idx++;
- }
- break;
- case 4:
- while (remainder > 0) {
- writeLongBE4(output, input[offset + idx], idx * 4);
- remainder--;
- idx++;
- }
- break;
- case 5:
- while (remainder > 0) {
- writeLongBE5(output, input[offset + idx], idx * 5);
- remainder--;
- idx++;
- }
- break;
- case 6:
- while (remainder > 0) {
- writeLongBE6(output, input[offset + idx], idx * 6);
- remainder--;
- idx++;
- }
- break;
- case 7:
- while (remainder > 0) {
- writeLongBE7(output, input[offset + idx], idx * 7);
- remainder--;
- idx++;
- }
- break;
- case 8:
- while (remainder > 0) {
- writeLongBE8(output, input[offset + idx], idx * 8);
- remainder--;
- idx++;
- }
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException {
-
- switch (numBytes) {
- case 1:
- writeBuffer[0] = (byte) (input[offset + 0] & 255);
- writeBuffer[1] = (byte) (input[offset + 1] & 255);
- writeBuffer[2] = (byte) (input[offset + 2] & 255);
- writeBuffer[3] = (byte) (input[offset + 3] & 255);
- writeBuffer[4] = (byte) (input[offset + 4] & 255);
- writeBuffer[5] = (byte) (input[offset + 5] & 255);
- writeBuffer[6] = (byte) (input[offset + 6] & 255);
- writeBuffer[7] = (byte) (input[offset + 7] & 255);
- break;
- case 2:
- writeLongBE2(output, input[offset + 0], 0);
- writeLongBE2(output, input[offset + 1], 2);
- writeLongBE2(output, input[offset + 2], 4);
- writeLongBE2(output, input[offset + 3], 6);
- writeLongBE2(output, input[offset + 4], 8);
- writeLongBE2(output, input[offset + 5], 10);
- writeLongBE2(output, input[offset + 6], 12);
- writeLongBE2(output, input[offset + 7], 14);
- break;
- case 3:
- writeLongBE3(output, input[offset + 0], 0);
- writeLongBE3(output, input[offset + 1], 3);
- writeLongBE3(output, input[offset + 2], 6);
- writeLongBE3(output, input[offset + 3], 9);
- writeLongBE3(output, input[offset + 4], 12);
- writeLongBE3(output, input[offset + 5], 15);
- writeLongBE3(output, input[offset + 6], 18);
- writeLongBE3(output, input[offset + 7], 21);
- break;
- case 4:
- writeLongBE4(output, input[offset + 0], 0);
- writeLongBE4(output, input[offset + 1], 4);
- writeLongBE4(output, input[offset + 2], 8);
- writeLongBE4(output, input[offset + 3], 12);
- writeLongBE4(output, input[offset + 4], 16);
- writeLongBE4(output, input[offset + 5], 20);
- writeLongBE4(output, input[offset + 6], 24);
- writeLongBE4(output, input[offset + 7], 28);
- break;
- case 5:
- writeLongBE5(output, input[offset + 0], 0);
- writeLongBE5(output, input[offset + 1], 5);
- writeLongBE5(output, input[offset + 2], 10);
- writeLongBE5(output, input[offset + 3], 15);
- writeLongBE5(output, input[offset + 4], 20);
- writeLongBE5(output, input[offset + 5], 25);
- writeLongBE5(output, input[offset + 6], 30);
- writeLongBE5(output, input[offset + 7], 35);
- break;
- case 6:
- writeLongBE6(output, input[offset + 0], 0);
- writeLongBE6(output, input[offset + 1], 6);
- writeLongBE6(output, input[offset + 2], 12);
- writeLongBE6(output, input[offset + 3], 18);
- writeLongBE6(output, input[offset + 4], 24);
- writeLongBE6(output, input[offset + 5], 30);
- writeLongBE6(output, input[offset + 6], 36);
- writeLongBE6(output, input[offset + 7], 42);
- break;
- case 7:
- writeLongBE7(output, input[offset + 0], 0);
- writeLongBE7(output, input[offset + 1], 7);
- writeLongBE7(output, input[offset + 2], 14);
- writeLongBE7(output, input[offset + 3], 21);
- writeLongBE7(output, input[offset + 4], 28);
- writeLongBE7(output, input[offset + 5], 35);
- writeLongBE7(output, input[offset + 6], 42);
- writeLongBE7(output, input[offset + 7], 49);
- break;
- case 8:
- writeLongBE8(output, input[offset + 0], 0);
- writeLongBE8(output, input[offset + 1], 8);
- writeLongBE8(output, input[offset + 2], 16);
- writeLongBE8(output, input[offset + 3], 24);
- writeLongBE8(output, input[offset + 4], 32);
- writeLongBE8(output, input[offset + 5], 40);
- writeLongBE8(output, input[offset + 6], 48);
- writeLongBE8(output, input[offset + 7], 56);
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE2(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 0);
- }
-
- private void writeLongBE3(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 0);
- }
-
- private void writeLongBE4(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 0);
- }
-
- private void writeLongBE5(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 0);
- }
-
- private void writeLongBE6(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 0);
- }
-
- private void writeLongBE7(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 0);
- }
-
- private void writeLongBE8(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 56);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 7] = (byte) (val >>> 0);
- }
-
- // Do not want to use Guava LongMath.checkedSubtract() here as it will throw
- // ArithmeticException in case of overflow
- public boolean isSafeSubtract(long left, long right) {
- return (left ^ right) >= 0 | (left ^ (left - right)) >= 0;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java
deleted file mode 100644
index 285a32aeb8..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType;
-import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.iq80.snappy.Snappy;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-class SnappyCodec implements CompressionCodec, DirectDecompressionCodec {
-
- Boolean direct = null;
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- int inBytes = in.remaining();
- // I should work on a patch for Snappy to support an overflow buffer
- // to prevent the extra buffer copy.
- byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)];
- int outBytes =
- Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes,
- compressed, 0);
- if (outBytes < inBytes) {
- int remaining = out.remaining();
- if (remaining >= outBytes) {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), outBytes);
- out.position(out.position() + outBytes);
- } else {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), remaining);
- out.position(out.limit());
- System.arraycopy(compressed, remaining, overflow.array(),
- overflow.arrayOffset(), outBytes - remaining);
- overflow.position(outBytes - remaining);
- }
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
- int inOffset = in.position();
- int uncompressLen =
- Snappy.uncompress(in.array(), in.arrayOffset() + inOffset,
- in.limit() - inOffset, out.array(), out.arrayOffset() + out.position());
- out.position(uncompressLen + out.position());
- out.flip();
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- try {
- if (ShimLoader.getHadoopShims().getDirectDecompressor(
- DirectCompressionType.SNAPPY) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims()
- .getDirectDecompressor(DirectCompressionType.SNAPPY);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(EnumSet modifiers) {
- // snappy allows no modifications
- return this;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java
deleted file mode 100644
index 382164530c..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * The name of a stream within a stripe.
- */
-class StreamName implements Comparable {
- private final int column;
- private final OrcProto.Stream.Kind kind;
-
- public enum Area {
- DATA, INDEX
- }
-
- public StreamName(int column, OrcProto.Stream.Kind kind) {
- this.column = column;
- this.kind = kind;
- }
-
- public boolean equals(Object obj) {
- if (obj != null && obj instanceof StreamName) {
- StreamName other = (StreamName) obj;
- return other.column == column && other.kind == kind;
- } else {
- return false;
- }
- }
-
- @Override
- public int compareTo(StreamName streamName) {
- if (streamName == null) {
- return -1;
- }
- Area area = getArea(kind);
- Area otherArea = StreamName.getArea(streamName.kind);
- if (area != otherArea) {
- return -area.compareTo(otherArea);
- }
- if (column != streamName.column) {
- return column < streamName.column ? -1 : 1;
- }
- return kind.compareTo(streamName.kind);
- }
-
- public int getColumn() {
- return column;
- }
-
- public OrcProto.Stream.Kind getKind() {
- return kind;
- }
-
- public Area getArea() {
- return getArea(kind);
- }
-
- public static Area getArea(OrcProto.Stream.Kind kind) {
- switch (kind) {
- case ROW_INDEX:
- case DICTIONARY_COUNT:
- case BLOOM_FILTER:
- return Area.INDEX;
- default:
- return Area.DATA;
- }
- }
-
- @Override
- public String toString() {
- return "Stream for column " + column + " kind " + kind;
- }
-
- @Override
- public int hashCode() {
- return column * 101 + kind.getNumber();
- }
-}
-
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java
deleted file mode 100644
index 42486646bf..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * Statistics for string columns.
- */
-public interface StringColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum string.
- * @return the minimum
- */
- String getMinimum();
-
- /**
- * Get the maximum string.
- * @return the maximum
- */
- String getMaximum();
-
- /**
- * Get the total length of all strings
- * @return the sum (total length)
- */
- long getSum();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java
deleted file mode 100644
index 8835cefa5e..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.io.Text;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * A red-black tree that stores strings. The strings are stored as UTF-8 bytes
- * and an offset for each entry.
- */
-class StringRedBlackTree extends RedBlackTree {
- private final DynamicByteArray byteArray = new DynamicByteArray();
- private final DynamicIntArray keyOffsets;
- private String newKey;
-
- public StringRedBlackTree(int initialCapacity) {
- super(initialCapacity);
- keyOffsets = new DynamicIntArray(initialCapacity);
- }
-
- public int add(String value) {
- newKey = value;
- return addNewKey();
- }
-
- private int addNewKey() {
- // if the newKey is actually new, add it to our byteArray and store the offset & length
- if (add()) {
- int len = newKey.length();
- keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
- }
- return lastAdd;
- }
-
- public int add(Text value) {
- newKey = value.toString();
- return addNewKey();
- }
-
- @Override
- protected int compareValue(int position) {
- int start = keyOffsets.get(position);
- int end;
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(position+1);
- }
- return byteArray.compare(newKey.getBytes(), 0, newKey.length(),
- start, end - start);
- }
-
- /**
- * The information about each node.
- */
- public interface VisitorContext {
- /**
- * Get the position where the key was originally added.
- * @return the number returned by add.
- */
- int getOriginalPosition();
-
- /**
- * Write the bytes for the string to the given output stream.
- * @param out the stream to write to.
- * @throws IOException
- */
- void writeBytes(OutputStream out) throws IOException;
-
- /**
- * Get the original string.
- * @return the string
- */
- Text getText();
-
- /**
- * Get the number of bytes.
- * @return the string's length in bytes
- */
- int getLength();
- }
-
- /**
- * The interface for visitors.
- */
- public interface Visitor {
- /**
- * Called once for each node of the tree in sort order.
- * @param context the information about each node
- * @throws IOException
- */
- void visit(VisitorContext context) throws IOException;
- }
-
- private class VisitorContextImpl implements VisitorContext {
- private int originalPosition;
- private int start;
- private int end;
- private final Text text = new Text();
-
- public int getOriginalPosition() {
- return originalPosition;
- }
-
- public Text getText() {
- byteArray.setText(text, start, end - start);
- return text;
- }
-
- public void writeBytes(OutputStream out) throws IOException {
- byteArray.write(out, start, end - start);
- }
-
- public int getLength() {
- return end - start;
- }
-
- void setPosition(int position) {
- originalPosition = position;
- start = keyOffsets.get(originalPosition);
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(originalPosition + 1);
- }
- }
- }
-
- private void recurse(int node, Visitor visitor, VisitorContextImpl context
- ) throws IOException {
- if (node != NULL) {
- recurse(getLeft(node), visitor, context);
- context.setPosition(node);
- visitor.visit(context);
- recurse(getRight(node), visitor, context);
- }
- }
-
- /**
- * Visit all of the nodes in the tree in sorted order.
- * @param visitor the action to be applied to each node
- * @throws IOException
- */
- public void visit(Visitor visitor) throws IOException {
- recurse(root, visitor, new VisitorContextImpl());
- }
-
- /**
- * Reset the table to empty.
- */
- public void clear() {
- super.clear();
- byteArray.clear();
- keyOffsets.clear();
- }
-
- public void getText(Text result, int originalPosition) {
- int offset = keyOffsets.get(originalPosition);
- int length;
- if (originalPosition + 1 == keyOffsets.size()) {
- length = byteArray.size() - offset;
- } else {
- length = keyOffsets.get(originalPosition + 1) - offset;
- }
- byteArray.setText(result, offset, length);
- }
-
- /**
- * Get the size of the character data in the table.
- * @return the bytes used by the table
- */
- public int getCharacterSize() {
- return byteArray.size();
- }
-
- /**
- * Calculate the approximate size in memory.
- * @return the number of bytes used in storing the tree.
- */
- public long getSizeInBytes() {
- return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() +
- super.getSizeInBytes();
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java
deleted file mode 100644
index 62819c1a22..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-/**
- * Information about the stripes in an ORC file that is provided by the Reader.
- */
-public interface StripeInformation {
- /**
- * Get the byte offset of the start of the stripe.
- * @return the bytes from the start of the file
- */
- long getOffset();
-
- /**
- * Get the total length of the stripe in bytes.
- * @return the number of bytes in the stripe
- */
- long getLength();
-
- /**
- * Get the length of the stripe's indexes.
- * @return the number of bytes in the index
- */
- long getIndexLength();
-
- /**
- * Get the length of the stripe's data.
- * @return the number of bytes in the stripe
- */
- long getDataLength();
-
- /**
- * Get the length of the stripe's tail section, which contains its index.
- * @return the number of bytes in the tail
- */
- long getFooterLength();
-
- /**
- * Get the number of rows in the stripe.
- * @return a count of the number of rows
- */
- long getNumberOfRows();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java
deleted file mode 100644
index 013fc8ec80..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.util.List;
-
-public class StripeStatistics {
- private final List cs;
-
- StripeStatistics(List list) {
- this.cs = list;
- }
-
- /**
- * Return list of column statistics
- *
- * @return column stats
- */
- public ColumnStatistics[] getColumnStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[cs.size()];
- for (int i = 0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
- }
- return result;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java
deleted file mode 100644
index 6fad0ac1fe..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.thirdparty.orc;
-
-import java.sql.Timestamp;
-
-/**
- * Statistics for Timestamp columns.
- */
-public interface TimestampColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum value for the column.
- * @return minimum value
- */
- Timestamp getMinimum();
-
- /**
- * Get the maximum value for the column.
- * @return maximum value
- */
- Timestamp getMaximum();
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
index c1781ef6a6..136e5a7b5d 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
@@ -39,6 +39,7 @@
import org.apache.tajo.exception.UnsupportedException;
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.unit.TimeUnit;
+import org.apache.tajo.util.datetime.DateTimeConstants;
import org.apache.tajo.util.datetime.DateTimeUtil;
import java.io.EOFException;
@@ -889,7 +890,7 @@ private static int parseNanos(long serialized) {
// borrowed from Facebook's TimestampStreamReader
private static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) {
- long millis = (seconds + baseTimestampInSeconds) * TimeUnit.MILLIS_PER_SECOND;
+ long millis = (seconds + baseTimestampInSeconds) * DateTimeConstants.MSECS_PER_SEC;
long nanos = parseNanos(serializedNanos);
// the rounding error exists because java always rounds up when dividing integers
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
index 669b44fbd3..2c85aa6653 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java
@@ -18,6 +18,8 @@
package org.apache.tajo.storage.thirdparty.orc;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
import org.apache.tajo.storage.Tuple;
import java.io.IOException;
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
index 4cf008a3a9..032885dece 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
@@ -19,7 +19,6 @@
package org.apache.tajo.storage.thirdparty.orc;
import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;
import com.google.protobuf.ByteString;
@@ -30,22 +29,20 @@
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.io.IOConstants;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.apache.tajo.datum.*;
-import org.apache.tajo.storage.Tuple;
-import org.apache.tajo.storage.thirdparty.orc.CompressionCodec.Modifier;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.RowIndexEntry;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.StripeStatistics;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.Type;
-import org.apache.tajo.storage.thirdparty.orc.OrcProto.UserMetadataItem;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.hadoop.hive.serde2.objectinspector.*;
-import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.Text;
-import org.apache.tajo.unit.TimeUnit;
+import org.apache.orc.*;
+import org.apache.orc.CompressionCodec.Modifier;
+import org.apache.orc.OrcProto.RowIndexEntry;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.impl.*;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.datum.Inet4Datum;
+import org.apache.tajo.datum.Int4Datum;
+import org.apache.tajo.datum.Int8Datum;
+import org.apache.tajo.storage.Tuple;
+import org.apache.tajo.storage.thirdparty.orc.OrcFile.*;
+import org.apache.tajo.util.datetime.DateTimeConstants;
import org.apache.tajo.util.datetime.DateTimeUtil;
import java.io.IOException;
@@ -95,10 +92,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final boolean addBlockPadding;
private final int bufferSize;
private final long blockSize;
- private final float paddingTolerance;
+ private final double paddingTolerance;
+ private final TypeDescription schema;
+
// the streams that make up the current stripe
- private final Map streams =
- new TreeMap<>();
+ private final Map streams = new TreeMap<>();
private FSDataOutputStream rawWriter = null;
// the compressed metadata information outStream
@@ -112,47 +110,32 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private long rawDataSize = 0;
private int rowsInIndex = 0;
private int stripesAtLastFlush = -1;
- private final List stripes =
- new ArrayList<>();
- private final Map userMetadata =
- new TreeMap<>();
+ private final List stripes = new ArrayList<>();
+ private final Map userMetadata = new TreeMap<>();
+ private final StreamFactory streamFactory = new StreamFactory();
private final TreeWriter treeWriter;
private final boolean buildIndex;
private final MemoryManager memoryManager;
- private final OrcFile.Version version;
+ private final Version version;
private final Configuration conf;
- private final OrcFile.WriterCallback callback;
- private final OrcFile.WriterContext callbackContext;
- private final OrcFile.EncodingStrategy encodingStrategy;
- private final OrcFile.CompressionStrategy compressionStrategy;
+ private final WriterCallback callback;
+ private final WriterContext callbackContext;
+ private final EncodingStrategy encodingStrategy;
+ private final CompressionStrategy compressionStrategy;
private final boolean[] bloomFilterColumns;
private final double bloomFilterFpp;
private boolean writeTimeZone;
private TimeZone timeZone;
- WriterImpl(FileSystem fs,
- Path path,
- Configuration conf,
- ObjectInspector inspector,
- long stripeSize,
- CompressionKind compress,
- int bufferSize,
- int rowIndexStride,
- MemoryManager memoryManager,
- boolean addBlockPadding,
- OrcFile.Version version,
- OrcFile.WriterCallback callback,
- OrcFile.EncodingStrategy encodingStrategy,
- OrcFile.CompressionStrategy compressionStrategy,
- float paddingTolerance,
- long blockSizeValue,
- String bloomFilterColumnNames,
- double bloomFilterFpp,
- TimeZone timeZone) throws IOException {
+ public WriterImpl(FileSystem fs,
+ Path path,
+ OrcFile.WriterOptions opts,
+ TimeZone timeZone) throws IOException {
this.fs = fs;
this.path = path;
- this.conf = conf;
- this.callback = callback;
+ this.conf = opts.getConfiguration();
+ this.callback = opts.getCallback();
+ this.schema = opts.getSchema();
if (callback != null) {
callbackContext = new OrcFile.WriterContext(){
@@ -164,100 +147,60 @@ public Writer getWriter() {
} else {
callbackContext = null;
}
- this.adjustedStripeSize = stripeSize;
- this.defaultStripeSize = stripeSize;
- this.version = version;
- this.encodingStrategy = encodingStrategy;
- this.compressionStrategy = compressionStrategy;
- this.addBlockPadding = addBlockPadding;
- this.blockSize = blockSizeValue;
- this.paddingTolerance = paddingTolerance;
- this.compress = compress;
- this.rowIndexStride = rowIndexStride;
- this.memoryManager = memoryManager;
- this.timeZone = timeZone;
+ this.adjustedStripeSize = opts.getStripeSize();
+ this.defaultStripeSize = opts.getStripeSize();
+ this.version = opts.getVersion();
+ this.encodingStrategy = opts.getEncodingStrategy();
+ this.compressionStrategy = opts.getCompressionStrategy();
+ this.addBlockPadding = opts.getBlockPadding();
+ this.blockSize = opts.getBlockSize();
+ this.paddingTolerance = opts.getPaddingTolerance();
+ this.compress = opts.getCompress();
+ this.rowIndexStride = opts.getRowIndexStride();
+ this.memoryManager = opts.getMemoryManager();
buildIndex = rowIndexStride > 0;
codec = createCodec(compress);
- String allColumns = conf.get(IOConstants.COLUMNS);
- if (allColumns == null) {
- allColumns = getColumnNamesFromInspector(inspector);
- }
- this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
+ int numColumns = schema.getMaximumId() + 1;
+ this.bufferSize = getEstimatedBufferSize(defaultStripeSize,
+ numColumns, opts.getBufferSize());
if (version == OrcFile.Version.V_0_11) {
/* do not write bloom filters for ORC v11 */
- this.bloomFilterColumns =
- OrcUtils.includeColumns(null, allColumns, inspector);
+ this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1];
} else {
this.bloomFilterColumns =
- OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
+ OrcUtils.includeColumns(opts.getBloomFilterColumns(), schema);
}
- this.bloomFilterFpp = bloomFilterFpp;
- treeWriter = createTreeWriter(inspector, new StreamFactory(), false);
+ this.bloomFilterFpp = opts.getBloomFilterFpp();
+ this.timeZone = timeZone;
+ treeWriter = createTreeWriter(schema, streamFactory, false);
if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
}
// ensure that we are able to handle callbacks before we register ourselves
- memoryManager.addWriter(path, stripeSize, this);
- }
-
- private String getColumnNamesFromInspector(ObjectInspector inspector) {
- List fieldNames = Lists.newArrayList();
- Joiner joiner = Joiner.on(",");
- if (inspector instanceof StructObjectInspector) {
- StructObjectInspector soi = (StructObjectInspector) inspector;
- List extends StructField> fields = soi.getAllStructFieldRefs();
- for(StructField sf : fields) {
- fieldNames.add(sf.getFieldName());
- }
- }
- return joiner.join(fieldNames);
+ memoryManager.addWriter(path, opts.getStripeSize(), this);
}
@VisibleForTesting
- int getEstimatedBufferSize(int bs) {
- return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
- }
-
- int getEstimatedBufferSize(String colNames, int bs) {
- long availableMem = getMemoryAvailableForORC();
- if (colNames != null) {
- final int numCols = colNames.split(",").length;
- if (numCols > COLUMN_COUNT_THRESHOLD) {
- // In BufferedStream, there are 3 outstream buffers (compressed,
- // uncompressed and overflow) and list of previously compressed buffers.
- // Since overflow buffer is rarely used, lets consider only 2 allocation.
- // Also, initially, the list of compression buffers will be empty.
- final int outStreamBuffers = codec == null ? 1 : 2;
-
- // max possible streams per column is 5. For string columns, there is
- // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams.
- final int maxStreams = 5;
-
- // Lets assume 10% memory for holding dictionary in memory and other
- // object allocations
- final long miscAllocation = (long) (0.1f * availableMem);
-
- // compute the available memory
- final long remainingMem = availableMem - miscAllocation;
-
- int estBufferSize = (int) (remainingMem /
- (maxStreams * outStreamBuffers * numCols));
- estBufferSize = getClosestBufferSize(estBufferSize, bs);
- if (estBufferSize > bs) {
- estBufferSize = bs;
- }
-
- LOG.info("WIDE TABLE - Number of columns: " + numCols +
- " Chosen compression buffer size: " + estBufferSize);
- return estBufferSize;
- }
+ public static int getEstimatedBufferSize(long stripeSize, int numColumns,
+ int bs) {
+ // The worst case is that there are 2 big streams per a column and
+ // we want to guarantee that each stream gets ~10 buffers.
+ // This keeps buffers small enough that we don't get really small stripe
+ // sizes.
+ int estBufferSize = (int) (stripeSize / (20 * numColumns));
+ estBufferSize = getClosestBufferSize(estBufferSize);
+ if (estBufferSize > bs) {
+ estBufferSize = bs;
+ } else {
+ LOG.info("WIDE TABLE - Number of columns: " + numColumns +
+ " Chosen compression buffer size: " + estBufferSize);
}
- return bs;
+ return estBufferSize;
}
- private int getClosestBufferSize(int estBufferSize, int bs) {
+ private static int getClosestBufferSize(int estBufferSize) {
final int kb4 = 4 * 1024;
final int kb8 = 8 * 1024;
final int kb16 = 16 * 1024;
@@ -617,8 +560,7 @@ public TimeZone getTimeZone() {
*/
private abstract static class TreeWriter {
protected final int id;
- protected final ObjectInspector inspector;
- private final BitFieldWriter isPresent;
+ protected final BitFieldWriter isPresent;
private final boolean isCompressed;
protected final ColumnStatisticsImpl indexStatistics;
protected final ColumnStatisticsImpl stripeColStatistics;
@@ -635,24 +577,24 @@ private abstract static class TreeWriter {
private final OrcProto.BloomFilter.Builder bloomFilterEntry;
private boolean foundNulls;
private OutStream isPresentOutStream;
- private final List<OrcProto.StripeStatistics.Builder> stripeStatsBuilders;
+ private final List<OrcProto.StripeStatistics.Builder> stripeStatsBuilders;
private final StreamFactory streamFactory;
/**
* Create a tree writer.
* @param columnId the column id of the column to write
- * @param inspector the object inspector to use
+ * @param schema the row schema
* @param streamFactory limited access to the Writer's data.
* @param nullable can the value be null?
* @throws IOException
*/
- TreeWriter(int columnId, ObjectInspector inspector,
+ TreeWriter(int columnId,
+ TypeDescription schema,
StreamFactory streamFactory,
boolean nullable) throws IOException {
this.streamFactory = streamFactory;
this.isCompressed = streamFactory.isCompressed();
this.id = columnId;
- this.inspector = inspector;
if (nullable) {
isPresentOutStream = streamFactory.createStream(id,
OrcProto.Stream.Kind.PRESENT);
@@ -662,9 +604,9 @@ private abstract static class TreeWriter {
}
this.foundNulls = false;
createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
- indexStatistics = ColumnStatisticsImpl.create(inspector);
- stripeColStatistics = ColumnStatisticsImpl.create(inspector);
- fileStatistics = ColumnStatisticsImpl.create(inspector);
+ indexStatistics = ColumnStatisticsImpl.create(schema);
+ stripeColStatistics = ColumnStatisticsImpl.create(schema);
+ fileStatistics = ColumnStatisticsImpl.create(schema);
childrenWriters = new TreeWriter[0];
rowIndex = OrcProto.RowIndex.newBuilder();
rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
@@ -913,10 +855,10 @@ private static class BooleanTreeWriter extends TreeWriter {
private final BitFieldWriter writer;
BooleanTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
PositionedOutputStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.writer = new BitFieldWriter(out, 1);
@@ -928,7 +870,8 @@ void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
boolean val = datum.asBool();
- indexStatistics.updateBoolean(val);
+ // TODO: validate the below line
+ indexStatistics.updateBoolean(val, 1);
writer.write(val ? 1 : 0);
}
}
@@ -952,10 +895,10 @@ private static class ByteTreeWriter extends TreeWriter {
private final RunLengthByteWriter writer;
ByteTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.writer = new RunLengthByteWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA));
recordPosition(rowIndexPosition);
@@ -966,7 +909,7 @@ void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
byte val = datum.asByte();
- indexStatistics.updateInteger(val);
+ indexStatistics.updateInteger(val, 1);
if (createBloomFilter) {
bloomFilter.addLong(val);
}
@@ -994,10 +937,10 @@ private static class IntegerTreeWriter extends TreeWriter {
private boolean isDirectV2 = true;
IntegerTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
OutStream out = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
@@ -1027,7 +970,7 @@ void write(Datum datum) throws IOException {
} else {
val = datum.asInt2();
}
- indexStatistics.updateInteger(val);
+ indexStatistics.updateInteger(val, 1);
if (createBloomFilter) {
// integers are converted to longs in column statistics and during SARG evaluation
bloomFilter.addLong(val);
@@ -1056,10 +999,10 @@ private static class FloatTreeWriter extends TreeWriter {
private final SerializationUtils utils;
FloatTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.utils = new SerializationUtils();
@@ -1100,10 +1043,10 @@ private static class DoubleTreeWriter extends TreeWriter {
private final SerializationUtils utils;
DoubleTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.utils = new SerializationUtils();
@@ -1138,33 +1081,33 @@ void recordPosition(PositionRecorder recorder) throws IOException {
}
}
- private static class StringTreeWriter extends TreeWriter {
+ private static abstract class StringBaseTreeWriter extends TreeWriter {
private static final int INITIAL_DICTIONARY_SIZE = 4096;
private final OutStream stringOutput;
private final IntegerWriter lengthOutput;
private final IntegerWriter rowOutput;
- private final StringRedBlackTree dictionary =
+ protected final StringRedBlackTree dictionary =
new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
- private final DynamicIntArray rows = new DynamicIntArray();
- private final PositionedOutputStream directStreamOutput;
- private final IntegerWriter directLengthOutput;
- private final List savedRowIndex =
- new ArrayList<>();
+ protected final DynamicIntArray rows = new DynamicIntArray();
+ protected final PositionedOutputStream directStreamOutput;
+ protected final IntegerWriter directLengthOutput;
+ private final List savedRowIndex =
+ new ArrayList();
private final boolean buildIndex;
- private final List rowIndexValueCount = new ArrayList<>();
+ private final List rowIndexValueCount = new ArrayList();
// If the number of keys in a dictionary is greater than this fraction of
//the total number of non-null rows, turn off dictionary encoding
- private final float dictionaryKeySizeThreshold;
- private boolean useDictionaryEncoding = true;
+ private final double dictionaryKeySizeThreshold;
+ protected boolean useDictionaryEncoding = true;
private boolean isDirectV2 = true;
private boolean doneDictionaryCheck;
- private final boolean strideDictionaryCheck;
+ protected final boolean strideDictionaryCheck;
- StringTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ StringBaseTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.isDirectV2 = isNewWriteFormat(writer);
stringOutput = writer.createStream(id,
OrcProto.Stream.Kind.DICTIONARY_DATA);
@@ -1178,33 +1121,14 @@ private static class StringTreeWriter extends TreeWriter {
directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
directLengthOutput = createIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- dictionaryKeySizeThreshold = writer.getConfiguration().getFloat(
- OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
- OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal);
- strideDictionaryCheck = writer.getConfiguration().getBoolean(
- OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname,
- OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.defaultBoolVal);
+ Configuration conf = writer.getConfiguration();
+ dictionaryKeySizeThreshold =
+ org.apache.orc.OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
+ strideDictionaryCheck =
+ org.apache.orc.OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
doneDictionaryCheck = false;
}
- @Override
- void write(Datum datum) throws IOException {
- super.write(datum);
- if (datum != null && datum.isNotNull()) {
- if (useDictionaryEncoding || !strideDictionaryCheck) {
- rows.add(dictionary.add(datum.toString()));
- } else {
- // write data and length
- directStreamOutput.write(datum.asByteArray(), 0, datum.size());
- directLengthOutput.write(datum.size());
- }
- indexStatistics.updateString(datum.toString());
- if (createBloomFilter) {
- bloomFilter.addBytes(datum.asByteArray(), datum.size());
- }
- }
- }
-
private boolean checkDictionaryEncoding() {
if (!doneDictionaryCheck) {
// Set the flag indicating whether or not to use dictionary encoding
@@ -1270,7 +1194,7 @@ private void flushDictionary() throws IOException {
private int currentId = 0;
@Override
public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
+ ) throws IOException {
context.writeBytes(stringOutput);
lengthOutput.write(context.getLength());
dumpOrder[context.getOriginalPosition()] = currentId++;
@@ -1384,29 +1308,76 @@ long estimateMemory() {
}
}
+ private static class StringTreeWriter extends StringBaseTreeWriter {
+ StringTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ }
+
+ @Override
+ void write(Datum datum) throws IOException {
+ super.write(datum);
+ if (datum != null && datum.isNotNull()) {
+ if (useDictionaryEncoding || !strideDictionaryCheck) {
+ rows.add(dictionary.add(datum.toString()));
+ } else {
+ // write data and length
+ directStreamOutput.write(datum.asByteArray(), 0, datum.size());
+ directLengthOutput.write(datum.size());
+ }
+ byte[] buf = datum.asByteArray();
+ indexStatistics.updateString(buf, 0, buf.length, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(buf, 0, buf.length);
+ }
+ }
+ }
+ }
+
/**
* Under the covers, char is written to ORC the same way as string.
*/
private static class CharTreeWriter extends StringTreeWriter {
+ private final int itemLength;
+ private final byte[] padding;
CharTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ itemLength = schema.getMaxLength();
+ padding = new byte[itemLength];
}
- }
- /**
- * Under the covers, varchar is written to ORC the same way as string.
- */
- private static class VarcharTreeWriter extends StringTreeWriter {
+ @Override
+ void write(Datum datum) throws IOException {
+ super.write(datum);
+ if (datum != null && datum.isNotNull()) {
+ byte[] ptr;
+ byte[] buf = datum.asByteArray();
+ if (buf.length >= itemLength) {
+ ptr = buf;
+ } else {
+ ptr = padding;
+ System.arraycopy(buf, 0, ptr, 0, buf.length);
+ Arrays.fill(ptr, buf.length, itemLength, (byte) ' ');
+ }
+ if (useDictionaryEncoding || !strideDictionaryCheck) {
+ rows.add(dictionary.add(ptr, 0, itemLength));
+ } else {
+ // write data and length
+ directStreamOutput.write(ptr, 0, itemLength);
+ directLengthOutput.write(itemLength);
+ }
- VarcharTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ indexStatistics.updateString(ptr, 0, ptr.length, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(ptr, 0, ptr.length);
+ }
+ }
}
}
@@ -1416,10 +1387,10 @@ private static class BinaryTreeWriter extends TreeWriter {
private boolean isDirectV2 = true;
BinaryTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
this.stream = writer.createStream(id,
OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
@@ -1442,11 +1413,12 @@ OrcProto.ColumnEncoding getEncoding() {
void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
- stream.write(datum.asByteArray(), 0, datum.size());
+ byte[] buf = datum.asByteArray();
+ stream.write(buf, 0, buf.length);
length.write(datum.size());
- indexStatistics.updateBinary(datum);
+ indexStatistics.updateBinary(buf, 0, buf.length, 1);
if (createBloomFilter) {
- bloomFilter.addBytes(datum.asByteArray(), datum.size());
+ bloomFilter.addBytes(buf, 0, buf.length);
}
}
}
@@ -1478,10 +1450,10 @@ private static class TimestampTreeWriter extends TreeWriter {
private TimeZone timeZone;
TimestampTreeWriter(int columnId,
- ObjectInspector inspector,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
this.isDirectV2 = isNewWriteFormat(writer);
this.seconds = createIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
@@ -1489,7 +1461,7 @@ private static class TimestampTreeWriter extends TreeWriter {
OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
recordPosition(rowIndexPosition);
// for unit tests to set different time zones
- this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / TimeUnit.MILLIS_PER_SECOND;
+ this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC;
writer.useWriterTimeZone(true);
timeZone = writer.getTimeZone();
}
@@ -1515,7 +1487,7 @@ void write(Datum datum) throws IOException {
Timestamp val = new Timestamp(javaTimestamp);
indexStatistics.updateTimestamp(val);
- seconds.write((val.getTime() / TimeUnit.MILLIS_PER_SECOND) - base_timestamp);
+ seconds.write((val.getTime() / DateTimeConstants.MSECS_PER_SEC) - base_timestamp);
nanos.write(formatNanos(val.getNanos()));
if (createBloomFilter) {
bloomFilter.addLong(val.getTime());
@@ -1561,12 +1533,12 @@ private static class DateTreeWriter extends TreeWriter {
private final boolean isDirectV2;
DateTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
+ super(columnId, schema, writer, nullable);
OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
+ OrcProto.Stream.Kind.DATA);
this.isDirectV2 = isNewWriteFormat(writer);
this.writer = createIntegerWriter(out, true, isDirectV2, writer);
recordPosition(rowIndexPosition);
@@ -1612,19 +1584,17 @@ OrcProto.ColumnEncoding getEncoding() {
}
private static class StructTreeWriter extends TreeWriter {
- private final List<? extends StructField> fields;
StructTreeWriter(int columnId,
- ObjectInspector inspector,
+ TypeDescription schema,
StreamFactory writer,
boolean nullable) throws IOException {
- super(columnId, inspector, writer, nullable);
- StructObjectInspector structObjectInspector =
- (StructObjectInspector) inspector;
- fields = structObjectInspector.getAllStructFieldRefs();
- childrenWriters = new TreeWriter[fields.size()];
+ super(columnId, schema, writer, nullable);
+ List<TypeDescription> children = schema.getChildren();
+ childrenWriters = new TreeWriter[children.size()];
for(int i=0; i < childrenWriters.length; ++i) {
childrenWriters[i] = createTreeWriter(
- fields.get(i).getFieldObjectInspector(), writer, true);
+ children.get(i), writer,
+ true);
}
recordPosition(rowIndexPosition);
}
@@ -1636,9 +1606,8 @@ void write(Datum datum) throws IOException {
void writeTuple(Tuple tuple) throws IOException {
super.write(tuple);
if (tuple != null) {
- for(int i = 0; i < fields.size(); ++i) {
- TreeWriter writer = childrenWriters[i];
- writer.write(tuple.asDatum(i));
+ for(int i = 0; i < childrenWriters.length; ++i) {
+ childrenWriters[i].write(tuple.asDatum(i));
}
}
}
@@ -1654,159 +1623,136 @@ void writeStripe(OrcProto.StripeFooter.Builder builder,
}
}
- private static TreeWriter createTreeWriter(ObjectInspector inspector,
+ private static TreeWriter createTreeWriter(TypeDescription schema,
StreamFactory streamFactory,
boolean nullable) throws IOException {
- switch (inspector.getCategory()) {
- case PRIMITIVE:
- switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) {
- case BOOLEAN:
- case VOID:
- return new BooleanTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case BYTE:
- return new ByteTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case SHORT:
- case INT:
- case LONG:
- return new IntegerTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case FLOAT:
- return new FloatTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case DOUBLE:
- return new DoubleTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case STRING:
- return new StringTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case CHAR:
- return new CharTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case VARCHAR:
- return new VarcharTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case BINARY:
- return new BinaryTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case TIMESTAMP:
- return new TimestampTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- case DATE:
- return new DateTreeWriter(streamFactory.getNextColumnId(),
- inspector, streamFactory, nullable);
- default:
- throw new IllegalArgumentException("Bad primitive category " +
- ((PrimitiveObjectInspector) inspector).getPrimitiveCategory());
- }
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return new BooleanTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BYTE:
+ return new ByteTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case SHORT:
+ case INT:
+ case LONG:
+ return new IntegerTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case FLOAT:
+ return new FloatTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DOUBLE:
+ return new DoubleTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case STRING:
+ return new StringTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case CHAR:
+ return new CharTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BINARY:
+ return new BinaryTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case TIMESTAMP:
+ return new TimestampTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DATE:
+ return new DateTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
case STRUCT:
- return new StructTreeWriter(streamFactory.getNextColumnId(), inspector,
- streamFactory, nullable);
+ return new StructTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
default:
throw new IllegalArgumentException("Bad category: " +
- inspector.getCategory());
+ schema.getCategory());
}
}
private static void writeTypes(OrcProto.Footer.Builder builder,
- TreeWriter treeWriter) {
+ TypeDescription schema) {
OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- switch (treeWriter.inspector.getCategory()) {
- case PRIMITIVE:
- switch (((PrimitiveObjectInspector) treeWriter.inspector).
- getPrimitiveCategory()) {
- case VOID:
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- // The char length needs to be written to file and should be available
- // from the object inspector
- CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo();
- type.setKind(Type.Kind.CHAR);
- type.setMaximumLength(charTypeInfo.getLength());
- break;
- case VARCHAR:
- // The varchar length needs to be written to file and should be available
- // from the object inspector
- VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo();
- type.setKind(Type.Kind.VARCHAR);
- type.setMaximumLength(typeInfo.getLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)((PrimitiveObjectInspector)treeWriter.inspector).getTypeInfo();
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(decTypeInfo.precision());
- type.setScale(decTypeInfo.scale());
- break;
- default:
- throw new IllegalArgumentException("Unknown primitive category: " +
- ((PrimitiveObjectInspector) treeWriter.inspector).
- getPrimitiveCategory());
- }
+ List<TypeDescription> children = schema.getChildren();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(schema.getPrecision());
+ type.setScale(schema.getScale());
break;
case LIST:
type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(treeWriter.childrenWriters[0].id);
+ type.addSubtypes(children.get(0).getId());
break;
case MAP:
type.setKind(OrcProto.Type.Kind.MAP);
- type.addSubtypes(treeWriter.childrenWriters[0].id);
- type.addSubtypes(treeWriter.childrenWriters[1].id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
break;
case STRUCT:
type.setKind(OrcProto.Type.Kind.STRUCT);
- for(TreeWriter child: treeWriter.childrenWriters) {
- type.addSubtypes(child.id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
}
- for(StructField field: ((StructTreeWriter) treeWriter).fields) {
- type.addFieldNames(field.getFieldName());
+ for(String field: schema.getFieldNames()) {
+ type.addFieldNames(field);
}
break;
case UNION:
type.setKind(OrcProto.Type.Kind.UNION);
- for(TreeWriter child: treeWriter.childrenWriters) {
- type.addSubtypes(child.id);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
}
break;
default:
throw new IllegalArgumentException("Unknown category: " +
- treeWriter.inspector.getCategory());
+ schema.getCategory());
}
builder.addTypes(type);
- for(TreeWriter child: treeWriter.childrenWriters) {
- writeTypes(builder, child);
+ if (children != null) {
+ for(TypeDescription child: children) {
+ writeTypes(builder, child);
+ }
}
}
@@ -1853,9 +1799,9 @@ private void flushStripe() throws IOException {
StreamName name = pair.getKey();
long streamSize = pair.getValue().getOutputSize();
builder.addStreams(OrcProto.Stream.newBuilder()
- .setColumn(name.getColumn())
- .setKind(name.getKind())
- .setLength(streamSize));
+ .setColumn(name.getColumn())
+ .setKind(name.getKind())
+ .setLength(streamSize));
if (StreamName.Area.INDEX == name.getArea()) {
indexSize += streamSize;
} else {
@@ -1880,8 +1826,8 @@ private void flushStripe() throws IOException {
// and user specified padding tolerance. Since stripe size can overflow
// the default stripe size we should apply this correction to avoid
// writing portion of last stripe to next hdfs block.
- float correction = overflow > 0 ? (float) overflow
- / (float) adjustedStripeSize : 0.0f;
+ double correction = overflow > 0 ? (double) overflow
+ / (double) adjustedStripeSize : 0.0;
// correction should not be greater than user specified padding
// tolerance
@@ -1939,75 +1885,60 @@ private void flushStripe() throws IOException {
}
private long computeRawDataSize() {
- long result = 0;
- for (TreeWriter child : treeWriter.getChildrenWriters()) {
- result += getRawDataSizeFromInspectors(child, child.inspector);
- }
- return result;
+ return getRawDataSize(treeWriter, schema);
}
- private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) {
+ private long getRawDataSize(TreeWriter child,
+ TypeDescription schema) {
long total = 0;
- switch (oi.getCategory()) {
- case PRIMITIVE:
- total += getRawDataSizeFromPrimitives(child, oi);
- break;
- case LIST:
- case MAP:
- case UNION:
- case STRUCT:
- for (TreeWriter tw : child.childrenWriters) {
- total += getRawDataSizeFromInspectors(tw, tw.inspector);
+ long numVals = child.fileStatistics.getNumberOfValues();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case FLOAT:
+ return numVals * JavaDataModel.get().primitive1();
+ case LONG:
+ case DOUBLE:
+ return numVals * JavaDataModel.get().primitive2();
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ // ORC strings are converted to java Strings. so use JavaDataModel to
+ // compute the overall size of strings
+ StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
+ numVals = numVals == 0 ? 1 : numVals;
+ int avgStringLen = (int) (scs.getSum() / numVals);
+ return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
+ case DECIMAL:
+ return numVals * JavaDataModel.get().lengthOfDecimal();
+ case DATE:
+ return numVals * JavaDataModel.get().lengthOfDate();
+ case BINARY:
+ // get total length of binary blob
+ BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
+ return bcs.getSum();
+ case TIMESTAMP:
+ return numVals * JavaDataModel.get().lengthOfTimestamp();
+ case LIST:
+ case MAP:
+ case UNION:
+ case STRUCT: {
+ TreeWriter[] childWriters = child.getChildrenWriters();
+ List<TypeDescription> childTypes = schema.getChildren();
+ for (int i=0; i < childWriters.length; ++i) {
+ total += getRawDataSize(childWriters[i], childTypes.get(i));
+ }
+ break;
}
- break;
- default:
- LOG.debug("Unknown object inspector category.");
- break;
+ default:
+ LOG.debug("Unknown object inspector category.");
+ break;
}
return total;
}
- private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) {
- long result = 0;
- long numVals = child.fileStatistics.getNumberOfValues();
- switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case FLOAT:
- return numVals * JavaDataModel.get().primitive1();
- case LONG:
- case DOUBLE:
- return numVals * JavaDataModel.get().primitive2();
- case STRING:
- case VARCHAR:
- case CHAR:
- // ORC strings are converted to java Strings. so use JavaDataModel to
- // compute the overall size of strings
- child = (StringTreeWriter) child;
- StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
- numVals = numVals == 0 ? 1 : numVals;
- int avgStringLen = (int) (scs.getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case BINARY:
- // get total length of binary blob
- BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
- return bcs.getSum();
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- default:
- LOG.debug("Unknown primitive category.");
- break;
- }
-
- return result;
- }
-
private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) {
switch (kind) {
case NONE: return OrcProto.CompressionKind.NONE;
@@ -2027,7 +1958,7 @@ private void writeFileStatistics(OrcProto.Footer.Builder builder,
}
}
- private int writeMetadata(long bodyLength) throws IOException {
+ private int writeMetadata() throws IOException {
getStream();
OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder();
for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) {
@@ -2052,7 +1983,7 @@ private int writeFooter(long bodyLength) throws IOException {
// populate raw data size
rawDataSize = computeRawDataSize();
// serialize the types
- writeTypes(builder, treeWriter);
+ writeTypes(builder, schema);
// add the stripe information
for(OrcProto.StripeInformation stripe: stripes) {
builder.addStripes(stripe);
@@ -2062,7 +1993,7 @@ private int writeFooter(long bodyLength) throws IOException {
// add all of the user metadata
for(Map.Entry entry: userMetadata.entrySet()) {
builder.addMetadata(OrcProto.UserMetadataItem.newBuilder()
- .setName(entry.getKey()).setValue(entry.getValue()));
+ .setName(entry.getKey()).setValue(entry.getValue()));
}
long startPosn = rawWriter.getPos();
OrcProto.Footer footer = builder.build();
@@ -2074,14 +2005,14 @@ private int writeFooter(long bodyLength) throws IOException {
private int writePostScript(int footerLength, int metadataLength) throws IOException {
OrcProto.PostScript.Builder builder =
- OrcProto.PostScript.newBuilder()
- .setCompression(writeCompressionKind(compress))
- .setFooterLength(footerLength)
- .setMetadataLength(metadataLength)
- .setMagic(OrcFile.MAGIC)
- .addVersion(version.getMajor())
- .addVersion(version.getMinor())
- .setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId());
+ OrcProto.PostScript.newBuilder()
+ .setCompression(writeCompressionKind(compress))
+ .setFooterLength(footerLength)
+ .setMetadataLength(metadataLength)
+ .setMagic(OrcFile.MAGIC)
+ .addVersion(version.getMajor())
+ .addVersion(version.getMinor())
+ .setWriterVersion(OrcFile.CURRENT_WRITER.getId());
if (compress != CompressionKind.NONE) {
builder.setCompressionBlockSize(bufferSize);
}
@@ -2120,7 +2051,7 @@ public void addTuple(Tuple tuple) throws IOException {
createRowIndexEntry();
}
}
- memoryManager.addedRow();
+ memoryManager.addedRow(1);
}
@Override
@@ -2132,7 +2063,7 @@ public void close() throws IOException {
memoryManager.removeWriter(path);
// actually close the file
flushStripe();
- int metadataLength = writeMetadata(rawWriter.getPos());
+ int metadataLength = writeMetadata();
int footerLength = writeFooter(rawWriter.getPos() - metadataLength);
rawWriter.writeByte(writePostScript(footerLength, metadataLength));
rawWriter.close();
@@ -2165,19 +2096,19 @@ public long writeIntermediateFooter() throws IOException {
if (callback != null) {
callback.preFooterWrite(callbackContext);
}
- int metaLength = writeMetadata(rawWriter.getPos());
+ int metaLength = writeMetadata();
int footLength = writeFooter(rawWriter.getPos() - metaLength);
rawWriter.writeByte(writePostScript(footLength, metaLength));
stripesAtLastFlush = stripes.size();
- ShimLoader.getHadoopShims().hflush(rawWriter);
+ rawWriter.hflush();
}
return rawWriter.getPos();
}
@Override
public void appendStripe(byte[] stripe, int offset, int length,
- StripeInformation stripeInfo,
- OrcProto.StripeStatistics stripeStatistics) throws IOException {
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics) throws IOException {
checkArgument(stripe != null, "Stripe must not be null");
checkArgument(length <= stripe.length,
"Specified length must not be greater specified array length");
@@ -2187,12 +2118,11 @@ public void appendStripe(byte[] stripe, int offset, int length,
getStream();
long start = rawWriter.getPos();
- long stripeLen = length;
long availBlockSpace = blockSize - (start % blockSize);
// see if stripe can fit in the current hdfs block, else pad the remaining
// space in the block
- if (stripeLen < blockSize && stripeLen > availBlockSpace &&
+ if (length < blockSize && length > availBlockSpace &&
addBlockPadding) {
byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)];
LOG.info(String.format("Padding ORC by %d bytes while merging..",
@@ -2245,7 +2175,7 @@ private List getAllColumnTreeWriters(TreeWriter rootTreeWriter) {
}
private void getAllColumnTreeWritersImpl(TreeWriter tw,
- List result) {
+ List result) {
result.add(tw);
for (TreeWriter child : tw.childrenWriters) {
getAllColumnTreeWritersImpl(child, result);
@@ -2253,9 +2183,9 @@ private void getAllColumnTreeWritersImpl(TreeWriter tw,
}
@Override
- public void appendUserMetadata(List userMetadata) {
+ public void appendUserMetadata(List userMetadata) {
if (userMetadata != null) {
- for (UserMetadataItem item : userMetadata) {
+ for (OrcProto.UserMetadataItem item : userMetadata) {
this.userMetadata.put(item.getName(), item.getValue());
}
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java
deleted file mode 100644
index d0a8fa7da3..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tajo.storage.thirdparty.orc;
-
-import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType;
-import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
-
-import javax.annotation.Nullable;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-import java.util.zip.DataFormatException;
-import java.util.zip.Deflater;
-import java.util.zip.Inflater;
-
-class ZlibCodec implements CompressionCodec, DirectDecompressionCodec {
-
- private Boolean direct = null;
-
- private final int level;
- private final int strategy;
-
- public ZlibCodec() {
- level = Deflater.DEFAULT_COMPRESSION;
- strategy = Deflater.DEFAULT_STRATEGY;
- }
-
- private ZlibCodec(int level, int strategy) {
- this.level = level;
- this.strategy = strategy;
- }
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- Deflater deflater = new Deflater(level, true);
- deflater.setStrategy(strategy);
- int length = in.remaining();
- deflater.setInput(in.array(), in.arrayOffset() + in.position(), length);
- deflater.finish();
- int outSize = 0;
- int offset = out.arrayOffset() + out.position();
- while (!deflater.finished() && (length > outSize)) {
- int size = deflater.deflate(out.array(), offset, out.remaining());
- out.position(size + out.position());
- outSize += size;
- offset += size;
- // if we run out of space in the out buffer, use the overflow
- if (out.remaining() == 0) {
- if (overflow == null) {
- deflater.end();
- return false;
- }
- out = overflow;
- offset = out.arrayOffset() + out.position();
- }
- }
- deflater.end();
- return length > outSize;
- }
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
-
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
-
- Inflater inflater = new Inflater(true);
- inflater.setInput(in.array(), in.arrayOffset() + in.position(),
- in.remaining());
- while (!(inflater.finished() || inflater.needsDictionary() ||
- inflater.needsInput())) {
- try {
- int count = inflater.inflate(out.array(),
- out.arrayOffset() + out.position(),
- out.remaining());
- out.position(count + out.position());
- } catch (DataFormatException dfe) {
- throw new IOException("Bad compression data", dfe);
- }
- }
- out.flip();
- inflater.end();
- in.position(in.limit());
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- // see nowrap option in new Inflater(boolean) which disables zlib headers
- try {
- if (ShimLoader.getHadoopShims().getDirectDecompressor(
- DirectCompressionType.ZLIB_NOHEADER) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims()
- .getDirectDecompressor(DirectCompressionType.ZLIB_NOHEADER);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(@Nullable EnumSet modifiers) {
-
- if (modifiers == null) {
- return this;
- }
-
- int l = this.level;
- int s = this.strategy;
-
- for (Modifier m : modifiers) {
- switch (m) {
- case BINARY:
- /* filtered == less LZ77, more huffman */
- s = Deflater.FILTERED;
- break;
- case TEXT:
- s = Deflater.DEFAULT_STRATEGY;
- break;
- case FASTEST:
- // deflate_fast looking for 8 byte patterns
- l = Deflater.BEST_SPEED;
- break;
- case FAST:
- // deflate_fast looking for 16 byte patterns
- l = Deflater.BEST_SPEED + 1;
- break;
- case DEFAULT:
- // deflate_slow looking for 128 byte patterns
- l = Deflater.DEFAULT_COMPRESSION;
- break;
- default:
- break;
- }
- }
- return new ZlibCodec(l, s);
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
index c80cf6c269..9da4b5d889 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
+++ b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
@@ -16,7 +16,9 @@
* limitations under the License.
*/
-package org.apache.tajo.storage.thirdparty.orc;
+package orc.proto;
+
+option java_package = "org.apache.orc";
message IntegerStatistics {
optional sint64 minimum = 1;
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
index cc3f46399b..608d066913 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java
@@ -27,6 +27,7 @@
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;
+import org.apache.orc.OrcConf;
import org.apache.tajo.BuiltinStorages;
import org.apache.tajo.catalog.CatalogUtil;
import org.apache.tajo.catalog.Schema;
@@ -38,7 +39,6 @@
import org.apache.tajo.storage.fragment.FileFragment;
import org.apache.tajo.storage.sequencefile.SequenceFileScanner;
import org.apache.tajo.storage.text.DelimitedTextFile;
-import org.apache.tajo.storage.thirdparty.orc.OrcFile.OrcTableProperties;
import org.apache.tajo.util.CommonTestingUtil;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -124,11 +124,11 @@ private void storageCompressionTest(String dataFormat, Class extends Compressi
meta.putProperty("sequencefile.serde", TextSerializerDeserializer.class.getName());
if (codec.equals(SnappyCodec.class)) {
- meta.putProperty(OrcTableProperties.COMPRESSION.name(), "SNAPPY");
+ meta.putProperty(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
} else if (codec.equals(Lz4Codec.class)) {
- meta.putProperty(OrcTableProperties.COMPRESSION.name(), "ZLIB");
+ meta.putProperty(OrcConf.COMPRESS.getAttribute(), "ZLIB");
} else {
- meta.putProperty(OrcTableProperties.COMPRESSION.name(), "NONE");
+ meta.putProperty(OrcConf.COMPRESS.getAttribute(), "NONE");
}
String fileName = "Compression_" + codec.getSimpleName();
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/testVariousTypes.avsc b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/testVariousTypes.avsc
index f71f0520ec..f1d1368447 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/testVariousTypes.avsc
+++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/testVariousTypes.avsc
@@ -12,8 +12,7 @@
{ "name": "col7", "type": "double" },
{ "name": "col8", "type": "string" },
{ "name": "col9", "type": "bytes" },
- { "name": "col10", "type": "bytes" },
- { "name": "col11", "type": "bytes" }
+ { "name": "col10", "type": "bytes" }
]
}
From bafa8adfa60e917278e65314c42d167bf59ee536 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Sun, 20 Mar 2016 00:15:50 +0900
Subject: [PATCH 03/16] Enable reader options.
---
.../apache/tajo/storage/orc/OrcScanner.java | 21 ++++++++---
.../thirdparty/orc/OrcRecordReader.java | 37 ++++++++-----------
.../thirdparty/orc/TreeReaderFactory.java | 27 +++++---------
3 files changed, 41 insertions(+), 44 deletions(-)
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
index 8082819bff..0f249cf49a 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
@@ -29,12 +29,15 @@
import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.io.Text;
import org.apache.orc.*;
+import org.apache.orc.Reader.Options;
import org.apache.orc.impl.BufferChunk;
import org.apache.orc.impl.InStream;
+import org.apache.tajo.TajoConstants;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.plan.expr.EvalNode;
import org.apache.tajo.storage.FileScanner;
+import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.storage.Tuple;
import org.apache.tajo.storage.fragment.Fragment;
import org.apache.tajo.storage.thirdparty.orc.OrcRecordReader;
@@ -44,6 +47,7 @@
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
+import java.util.TimeZone;
public class OrcScanner extends FileScanner {
private static final Log LOG = LogFactory.getLog(OrcScanner.class);
@@ -230,11 +234,16 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
);
}
- public OrcRecordReader getRecordReader() throws IOException {
- boolean skipCorruptRecords = conf.getBoolean("orc.skip.corrupt-records", false);
+ public OrcRecordReader createRecordReader() throws IOException {
+ return new OrcRecordReader(this.stripes, fileSystem, schema, targets, fragment, types, codec, bufferSize,
+ rowIndexStride, buildReaderOptions(meta), conf,
+ TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, TajoConstants.DEFAULT_SYSTEM_TIMEZONE)));
+ }
- return new OrcRecordReader(meta, this.stripes, fileSystem, schema, targets, fragment,
- skipCorruptRecords, types, codec, bufferSize, rowIndexStride, conf);
+ private static Options buildReaderOptions(TableMeta meta) {
+ return new Options()
+ .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(), String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue()))))
+ .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(), String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue()))));
}
@Override
@@ -264,7 +273,7 @@ public void init() throws IOException {
this.versionList = footerMetaData.versionList;
this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
- recordReader = getRecordReader();
+ recordReader = createRecordReader();
}
@Override
@@ -280,7 +289,7 @@ public Tuple next() throws IOException {
public void reset() throws IOException {
// TODO: improve this
this.close();
- recordReader = getRecordReader();
+ recordReader = createRecordReader();
}
@Override
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
index c018c802d8..7194bf4d50 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java
@@ -39,10 +39,7 @@
import java.io.Closeable;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
public class OrcRecordReader implements Closeable {
@@ -72,19 +69,18 @@ public class OrcRecordReader implements Closeable {
private final DataReader dataReader;
private final Tuple result;
- public OrcRecordReader(TableMeta meta,
- List stripes,
- FileSystem fileSystem,
- Schema schema,
- Column[] target,
- FileFragment fragment,
- boolean skipCorruptRecords,
- List types,
- CompressionCodec codec,
- int bufferSize,
- long strideRate,
- Configuration conf
- ) throws IOException {
+ public OrcRecordReader(List stripes,
+ FileSystem fileSystem,
+ Schema schema,
+ Column[] target,
+ FileFragment fragment,
+ List types,
+ CompressionCodec codec,
+ int bufferSize,
+ long strideRate,
+ Reader.Options options,
+ Configuration conf,
+ TimeZone timeZone) throws IOException {
result = new VTuple(target.length);
@@ -117,17 +113,16 @@ public OrcRecordReader(TableMeta meta,
}
// TODO: we could change the ctor to pass this externally
- this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, true, codec);
+ this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, options.getUseZeroCopy(), codec);
this.dataReader.open();
firstRow = skippedRows;
totalRowCount = rows;
- Boolean skipCorrupt = skipCorruptRecords;
reader = new DatumTreeReader[target.length];
for (int i = 0; i < reader.length; i++) {
- reader[i] = TreeReaderFactory.createTreeReader(meta, schema.getColumnId(target[i].getQualifiedName()), target[i],
- skipCorrupt);
+ reader[i] = TreeReaderFactory.createTreeReader(timeZone, schema.getColumnId(target[i].getQualifiedName()), target[i],
+ options.getSkipCorruptRecords());
}
indexes = new OrcProto.RowIndex[types.size()];
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
index 136e5a7b5d..b31523f32b 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
@@ -24,21 +24,14 @@
import org.apache.hadoop.io.Text;
import org.apache.orc.OrcProto;
import org.apache.orc.impl.*;
-import org.apache.orc.impl.DynamicByteArray;
-import org.apache.orc.impl.SerializationUtils;
-import org.apache.orc.impl.StreamName;
import org.apache.orc.impl.WriterImpl;
-import org.apache.tajo.TajoConstants;
import org.apache.tajo.catalog.Column;
-import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.catalog.TypeDesc;
import org.apache.tajo.datum.Datum;
import org.apache.tajo.datum.DatumFactory;
import org.apache.tajo.datum.NullDatum;
import org.apache.tajo.exception.TajoRuntimeException;
import org.apache.tajo.exception.UnsupportedException;
-import org.apache.tajo.storage.StorageConstants;
-import org.apache.tajo.unit.TimeUnit;
import org.apache.tajo.util.datetime.DateTimeConstants;
import org.apache.tajo.util.datetime.DateTimeUtil;
@@ -759,11 +752,11 @@ public static class TimestampTreeReader extends DatumTreeReader {
private TimeZone writerTimeZone;
private boolean hasSameTZRules;
- TimestampTreeReader(TableMeta meta, int columnId, boolean skipCorrupt) throws IOException {
- this(meta, columnId, null, null, null, null, skipCorrupt);
+ TimestampTreeReader(TimeZone timeZone, int columnId, boolean skipCorrupt) throws IOException {
+ this(timeZone, columnId, null, null, null, null, skipCorrupt);
}
- protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStream, InStream dataStream,
+ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentStream, InStream dataStream,
InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt)
throws IOException {
super(columnId, presentStream);
@@ -772,8 +765,7 @@ protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStre
this.readerTimeZone = TimeZone.getDefault();
this.writerTimeZone = readerTimeZone;
this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- this.base_timestamp = getBaseTimestamp(TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE,
- TajoConstants.DEFAULT_SYSTEM_TIMEZONE)).getID());
+ this.base_timestamp = getBaseTimestamp(timeZone.getID());
if (encoding != null) {
checkEncoding(encoding);
@@ -1414,6 +1406,7 @@ void skipRows(long items) throws IOException {
}
}
+ // TODO: enable this to support record type
// protected static class StructTreeReader extends TreeReader {
// private final int fileColumnCount;
// private final int resultColumnCount;
@@ -1533,10 +1526,10 @@ void skipRows(long items) throws IOException {
// }
// }
- public static DatumTreeReader createTreeReader(TableMeta meta,
- int columnId,
- Column column,
- boolean skipCorrupt
+ public static DatumTreeReader createTreeReader(TimeZone timeZone,
+ int columnId,
+ Column column,
+ boolean skipCorrupt
) throws IOException {
TypeDesc typeDesc = column.getTypeDesc();
int orcColumnId = columnId + 1; // root record column is considered
@@ -1562,7 +1555,7 @@ public static DatumTreeReader createTreeReader(TableMeta meta,
case BLOB:
return new BinaryTreeReader(orcColumnId);
case TIMESTAMP:
- return new TimestampTreeReader(meta, orcColumnId, skipCorrupt);
+ return new TimestampTreeReader(timeZone, orcColumnId, skipCorrupt);
case DATE:
return new DateTreeReader(orcColumnId);
case INET4:
From 6414385a139850cd7819762c9e4868f7c055d3c7 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Sun, 20 Mar 2016 00:23:51 +0900
Subject: [PATCH 04/16] Remove proto file and cleanup code.
---
.../catalog/store/TestHiveCatalogStore.java | 2 +-
tajo-storage/tajo-storage-hdfs/pom.xml | 1 -
.../apache/tajo/storage/orc/ORCAppender.java | 24 +-
.../apache/tajo/storage/orc/OrcScanner.java | 6 +-
.../tajo/storage/thirdparty/orc/OrcUtils.java | 7 +-
.../storage/thirdparty/orc/WriterImpl.java | 1 -
.../src/main/proto/orc_proto.proto | 219 ------------------
7 files changed, 23 insertions(+), 237 deletions(-)
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
index 6bb66a1a46..46935fc259 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java
@@ -78,7 +78,7 @@ public static void setUp() throws Exception {
conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString());
conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri);
conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, warehousePath.toUri().toString());
- conf.setBoolean("datanucleus.schema.autoCreateAll", true); // TODO: check this is valid
+ conf.setBoolean("datanucleus.schema.autoCreateAll", true);
// create local HiveCatalogStore.
TajoConf tajoConf = new TajoConf(conf);
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index 2c5da75ef7..8688b29279 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -129,7 +129,6 @@
--proto_path=../../tajo-catalog/tajo-catalog-common/src/main/proto--java_out=target/generated-sources/protosrc/main/proto/StorageFragmentProtos.proto
-
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
index ec4349628c..b283b2219b 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
@@ -111,14 +111,22 @@ private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, Tabl
return OrcFile.writerOptions(conf)
.setSchema(OrcUtils.convertSchema(schema))
.compress(getCompressionKind(meta))
- .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(), String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
- .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(), String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
- .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(), String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
- .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(), String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
- .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(), String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
- .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(), String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
- .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
- .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
+ .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(),
+ String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
+ .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(),
+ String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
+ .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(),
+ String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
+ .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(),
+ String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
+ .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(),
+ String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
+ .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(),
+ String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
+ .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(),
+ String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
+ .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
+ String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
}
private static CompressionKind getCompressionKind(TableMeta meta) {
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
index 0f249cf49a..5d9dfac54f 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
@@ -242,8 +242,10 @@ rowIndexStride, buildReaderOptions(meta), conf,
private static Options buildReaderOptions(TableMeta meta) {
return new Options()
- .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(), String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue()))))
- .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(), String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue()))));
+ .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(),
+ String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue()))))
+ .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(),
+ String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue()))));
}
@Override
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
index 91e4dc60d4..cc0e08e20a 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
@@ -17,20 +17,17 @@
*/
package org.apache.tajo.storage.thirdparty.orc;
-import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.serde2.objectinspector.*;
-import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.orc.CompressionCodec;
import org.apache.orc.TypeDescription;
-import org.apache.orc.TypeDescription.Category;
import org.apache.orc.impl.SnappyCodec;
+import org.apache.orc.impl.ZlibCodec;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TypeDesc;
-import org.apache.tajo.common.TajoDataTypes.Type;
import org.apache.tajo.exception.TajoRuntimeException;
import org.apache.tajo.exception.UnsupportedDataTypeException;
@@ -215,7 +212,7 @@ public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.Compres
case NONE:
return null;
case ZLIB:
- return new org.apache.orc.impl.ZlibCodec();
+ return new ZlibCodec();
case SNAPPY:
return new SnappyCodec();
case LZO:
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
index 032885dece..e0ad3d7bed 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java
@@ -870,7 +870,6 @@ void write(Datum datum) throws IOException {
super.write(datum);
if (datum != null && datum.isNotNull()) {
boolean val = datum.asBool();
- // TODO: validate the below line
indexStatistics.updateBoolean(val, 1);
writer.write(val ? 1 : 0);
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
deleted file mode 100644
index 9da4b5d889..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto
+++ /dev/null
@@ -1,219 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package orc.proto;
-
-option java_package = "org.apache.orc";
-
-message IntegerStatistics {
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
- optional sint64 sum = 3;
-}
-
-message DoubleStatistics {
- optional double minimum = 1;
- optional double maximum = 2;
- optional double sum = 3;
-}
-
-message StringStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- // sum will store the total length of all strings in a stripe
- optional sint64 sum = 3;
-}
-
-message BucketStatistics {
- repeated uint64 count = 1 [packed=true];
-}
-
-message DecimalStatistics {
- optional string minimum = 1;
- optional string maximum = 2;
- optional string sum = 3;
-}
-
-message DateStatistics {
- // min,max values saved as days since epoch
- optional sint32 minimum = 1;
- optional sint32 maximum = 2;
-}
-
-message TimestampStatistics {
- // min,max values saved as milliseconds since epoch
- optional sint64 minimum = 1;
- optional sint64 maximum = 2;
-}
-
-message BinaryStatistics {
- // sum will store the total binary blob length in a stripe
- optional sint64 sum = 1;
-}
-
-message ColumnStatistics {
- optional uint64 numberOfValues = 1;
- optional IntegerStatistics intStatistics = 2;
- optional DoubleStatistics doubleStatistics = 3;
- optional StringStatistics stringStatistics = 4;
- optional BucketStatistics bucketStatistics = 5;
- optional DecimalStatistics decimalStatistics = 6;
- optional DateStatistics dateStatistics = 7;
- optional BinaryStatistics binaryStatistics = 8;
- optional TimestampStatistics timestampStatistics = 9;
- optional bool hasNull = 10;
-}
-
-message RowIndexEntry {
- repeated uint64 positions = 1 [packed=true];
- optional ColumnStatistics statistics = 2;
-}
-
-message RowIndex {
- repeated RowIndexEntry entry = 1;
-}
-
-message BloomFilter {
- optional uint32 numHashFunctions = 1;
- repeated fixed64 bitset = 2;
-}
-
-message BloomFilterIndex {
- repeated BloomFilter bloomFilter = 1;
-}
-
-message Stream {
- // if you add new index stream kinds, you need to make sure to update
- // StreamName to ensure it is added to the stripe in the right area
- enum Kind {
- PRESENT = 0;
- DATA = 1;
- LENGTH = 2;
- DICTIONARY_DATA = 3;
- DICTIONARY_COUNT = 4;
- SECONDARY = 5;
- ROW_INDEX = 6;
- BLOOM_FILTER = 7;
- }
- optional Kind kind = 1;
- optional uint32 column = 2;
- optional uint64 length = 3;
-}
-
-message ColumnEncoding {
- enum Kind {
- DIRECT = 0;
- DICTIONARY = 1;
- DIRECT_V2 = 2;
- DICTIONARY_V2 = 3;
- }
- optional Kind kind = 1;
- optional uint32 dictionarySize = 2;
-}
-
-message StripeFooter {
- repeated Stream streams = 1;
- repeated ColumnEncoding columns = 2;
- optional string writerTimezone = 3;
-}
-
-message Type {
- enum Kind {
- BOOLEAN = 0;
- BYTE = 1;
- SHORT = 2;
- INT = 3;
- LONG = 4;
- FLOAT = 5;
- DOUBLE = 6;
- STRING = 7;
- BINARY = 8;
- TIMESTAMP = 9;
- LIST = 10;
- MAP = 11;
- STRUCT = 12;
- UNION = 13;
- DECIMAL = 14;
- DATE = 15;
- VARCHAR = 16;
- CHAR = 17;
- }
- optional Kind kind = 1;
- repeated uint32 subtypes = 2 [packed=true];
- repeated string fieldNames = 3;
- optional uint32 maximumLength = 4;
- optional uint32 precision = 5;
- optional uint32 scale = 6;
-}
-
-message StripeInformation {
- optional uint64 offset = 1;
- optional uint64 indexLength = 2;
- optional uint64 dataLength = 3;
- optional uint64 footerLength = 4;
- optional uint64 numberOfRows = 5;
-}
-
-message UserMetadataItem {
- optional string name = 1;
- optional bytes value = 2;
-}
-
-message StripeStatistics {
- repeated ColumnStatistics colStats = 1;
-}
-
-message Metadata {
- repeated StripeStatistics stripeStats = 1;
-}
-
-message Footer {
- optional uint64 headerLength = 1;
- optional uint64 contentLength = 2;
- repeated StripeInformation stripes = 3;
- repeated Type types = 4;
- repeated UserMetadataItem metadata = 5;
- optional uint64 numberOfRows = 6;
- repeated ColumnStatistics statistics = 7;
- optional uint32 rowIndexStride = 8;
-}
-
-enum CompressionKind {
- NONE = 0;
- ZLIB = 1;
- SNAPPY = 2;
- LZO = 3;
-}
-
-// Serialized length must be less that 255 bytes
-message PostScript {
- optional uint64 footerLength = 1;
- optional CompressionKind compression = 2;
- optional uint64 compressionBlockSize = 3;
- // the version of the file format
- // [0, 11] = Hive 0.11
- // [0, 12] = Hive 0.12
- repeated uint32 version = 4 [packed = true];
- optional uint64 metadataLength = 5;
- // Version of the writer:
- // 0 (or missing) = original
- // 1 = HIVE-8732 fixed
- optional uint32 writerVersion = 6;
- // Leave this last in the record
- optional string magic = 8000;
-}
From 2b9dd2443a5ae1133031846fe7d639a35e7ff8a5 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Sun, 20 Mar 2016 13:28:12 +0900
Subject: [PATCH 05/16] Fix test failure
---
.../src/main/resources/storage-default.xml | 2 +-
.../src/test/resources/storage-default.xml | 2 +-
.../apache/tajo/storage/orc/OrcScanner.java | 2 +
.../thirdparty/orc/TreeReaderFactory.java | 47 ++++++++++---------
4 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
index 7f4661b451..2454714452 100644
--- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
+++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml
@@ -130,7 +130,7 @@
tajo.storage.scanner-handler.orc.class
- org.apache.tajo.storage.orc.ORCScanner
+ org.apache.tajo.storage.orc.OrcScanner
diff --git a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
index 934dd01f24..1c4530a3cd 100644
--- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
+++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml
@@ -132,7 +132,7 @@
tajo.storage.scanner-handler.orc.class
- org.apache.tajo.storage.orc.ORCScanner
+ org.apache.tajo.storage.orc.OrcScanner
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
index 5d9dfac54f..86fe7ad2de 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
@@ -32,9 +32,11 @@
import org.apache.orc.Reader.Options;
import org.apache.orc.impl.BufferChunk;
import org.apache.orc.impl.InStream;
+import org.apache.tajo.SessionVars;
import org.apache.tajo.TajoConstants;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.conf.TajoConf.ConfVars;
import org.apache.tajo.plan.expr.EvalNode;
import org.apache.tajo.storage.FileScanner;
import org.apache.tajo.storage.StorageConstants;
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
index b31523f32b..9b3f568fd2 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java
@@ -24,7 +24,6 @@
import org.apache.hadoop.io.Text;
import org.apache.orc.OrcProto;
import org.apache.orc.impl.*;
-import org.apache.orc.impl.WriterImpl;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.TypeDesc;
import org.apache.tajo.datum.Datum;
@@ -38,6 +37,7 @@
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
+import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
@@ -45,6 +45,8 @@
import java.util.Map;
import java.util.TimeZone;
+import static org.apache.tajo.storage.thirdparty.orc.WriterImpl.BASE_TIMESTAMP_STRING;
+
public class TreeReaderFactory {
private final static Log LOG = LogFactory.getLog(TreeReaderFactory.class);
@@ -751,6 +753,7 @@ public static class TimestampTreeReader extends DatumTreeReader {
private final TimeZone readerTimeZone;
private TimeZone writerTimeZone;
private boolean hasSameTZRules;
+ private final TimeZone timeZone;
TimestampTreeReader(TimeZone timeZone, int columnId, boolean skipCorrupt) throws IOException {
this(timeZone, columnId, null, null, null, null, skipCorrupt);
@@ -765,7 +768,7 @@ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentS
this.readerTimeZone = TimeZone.getDefault();
this.writerTimeZone = readerTimeZone;
this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- this.base_timestamp = getBaseTimestamp(timeZone.getID());
+ this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
if (encoding != null) {
checkEncoding(encoding);
@@ -777,6 +780,7 @@ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentS
this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
}
}
+ this.timeZone = timeZone;
}
@Override
@@ -800,6 +804,7 @@ void startStripe(Map streams,
streams.get(new org.apache.orc.impl.StreamName(columnId,
OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
+ this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC;
}
private long getBaseTimestamp(String timeZoneId) throws IOException {
@@ -814,8 +819,7 @@ private long getBaseTimestamp(String timeZoneId) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
sdf.setTimeZone(writerTimeZone);
try {
- long epoch =
- sdf.parse(org.apache.orc.impl.WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
+ long epoch = sdf.parse(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC;
baseTimestampMap.put(timeZoneId, epoch);
return epoch;
} catch (ParseException e) {
@@ -846,23 +850,24 @@ Datum next() throws IOException {
if (valuePresent) {
long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp);
- long offset = 0;
- // If reader and writer time zones have different rules, adjust the timezone difference
- // between reader and writer taking day light savings into account.
- if (!hasSameTZRules) {
- offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
- }
- long adjustedMillis = millis + offset;
-
- // Sometimes the reader timezone might have changed after adding the adjustedMillis.
- // To account for that change, check for any difference in reader timezone after
- // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time).
- if (!hasSameTZRules &&
- (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
- long newOffset =
- writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
- adjustedMillis = millis + newOffset;
- }
+ long adjustedMillis = millis - timeZone.getRawOffset();
+// long offset = 0;
+// // If reader and writer time zones have different rules, adjust the timezone difference
+// // between reader and writer taking day light savings into account.
+// if (!hasSameTZRules) {
+// offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
+// }
+// long adjustedMillis = millis + offset;
+//
+// // Sometimes the reader timezone might have changed after adding the adjustedMillis.
+// // To account for that change, check for any difference in reader timezone after
+// // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time).
+// if (!hasSameTZRules &&
+// (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
+// long newOffset =
+// writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
+// adjustedMillis = millis + newOffset;
+// }
return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis));
} else {
return NullDatum.get();
From 70f846477044b1fbc25bdde5a852d9dccd51ee33 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Sun, 20 Mar 2016 21:37:36 +0900
Subject: [PATCH 06/16] Add some tests
---
.../tajo-catalog-drivers/tajo-hive/pom.xml | 6 +-
.../tajo/catalog/store/HiveCatalogStore.java | 3 +-
.../tajo/catalog/store/HiveCatalogUtil.java | 3 +
.../tajo/engine/query/TestSelectQuery.java | 19 -----
.../tajo/storage/TestQueryOnOrcFile.java | 79 +++++++++++++++++++
.../timezoned/timezoned1.tbl} | 0
.../datetime_table_timezoned_ddl.sql | 5 ++
.../datetime_table_timezoned_orc_ddl.sql | 0
.../TestQueryOnOrcFile/testTimezone1.sql | 1 +
.../TestSelectQuery/testTimezonedORCTable.sql | 2 -
.../testTimezone1.result} | 0
.../TestQueryOnOrcFile/testTimezone2.result | 5 ++
.../TestQueryOnOrcFile/testTimezone3.result | 5 ++
.../TestQueryOnOrcFile/testTimezone4.result | 5 ++
tajo-dist/pom.xml | 9 +--
tajo-project/pom.xml | 1 +
tajo-storage/tajo-storage-hdfs/pom.xml | 26 +++++-
.../apache/tajo/storage/orc/OrcScanner.java | 7 +-
18 files changed, 136 insertions(+), 40 deletions(-)
create mode 100644 tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java
rename tajo-core-tests/src/test/resources/dataset/{TestSelectQuery/timezoned/table1.tbl => TestQueryOnOrcFile/timezoned/timezoned1.tbl} (100%)
create mode 100644 tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql
rename tajo-core-tests/src/test/resources/queries/{TestSelectQuery => TestQueryOnOrcFile}/datetime_table_timezoned_orc_ddl.sql (100%)
create mode 100644 tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql
delete mode 100644 tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql
rename tajo-core-tests/src/test/resources/results/{TestSelectQuery/testTimezonedORCTable.result => TestQueryOnOrcFile/testTimezone1.result} (100%)
create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result
create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result
create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
index 1a8a188b79..a1e0c98b57 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
@@ -33,8 +33,6 @@
UTF-8UTF-8
- 1.5.0
- 2.1.0
@@ -279,8 +277,8 @@
- com.twitter
- parquet-hive-bundle
+ org.apache.parquet
+ parquet-hadoop-bundle${parquet.version}
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
index 1d0d261d12..cad3c2414a 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java
@@ -38,6 +38,7 @@
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.tajo.BuiltinStorages;
import org.apache.tajo.TajoConstants;
import org.apache.tajo.algebra.Expr;
@@ -57,10 +58,8 @@
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.util.KeyValueSet;
import org.apache.thrift.TException;
-import parquet.hadoop.ParquetOutputFormat;
import java.io.File;
-import java.io.IOException;
import java.util.*;
public class HiveCatalogStore extends CatalogConstants implements CatalogStore {
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
index bbb7adeee3..87b391ea60 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde.serdeConstants;
@@ -137,6 +138,8 @@ public static String getDataFormat(StorageDescriptor descriptor) {
return BuiltinStorages.PARQUET;
} else if (AvroSerDe.class.getName().equals(serde)) {
return BuiltinStorages.AVRO;
+ } else if (OrcSerde.class.getName().equals(serde)) {
+ return BuiltinStorages.ORC;
} else {
throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
}
diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
index e55acf1fc0..a2dec50e91 100644
--- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
+++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
@@ -682,25 +682,6 @@ public void testLoadIntoTimezonedTable() throws Exception {
executeString("DROP TABLE IF EXISTS timezoned_load2 PURGE");
}
}
-
- @Test
- public void testTimezonedORCTable() throws Exception {
- try {
-
- executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned");
- executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc");
-
- executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned");
-
- ResultSet res = executeQuery();
- assertResultSet(res, "testTimezonedORCTable.result");
- executeString("SET TIME ZONE 'GMT'");
- cleanupQuery(res);
- } finally {
- executeString("DROP TABLE IF EXISTS timezoned");
- executeString("DROP TABLE IF EXISTS timezoned_orc PURGE");
- }
- }
@Test
public void testMultiBytesDelimiter1() throws Exception {
diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java
new file mode 100644
index 0000000000..29d132e35f
--- /dev/null
+++ b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage;
+
+import org.apache.tajo.IntegrationTest;
+import org.apache.tajo.QueryTestCaseBase;
+import org.junit.*;
+import org.junit.experimental.categories.Category;
+
+import java.sql.ResultSet;
+
+@Category(IntegrationTest.class)
+public class TestQueryOnOrcFile extends QueryTestCaseBase {
+
+ @Before
+ public void setup() throws Exception {
+ executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned");
+ executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc");
+
+ executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned");
+ }
+
+ @After
+ public void teardown() throws Exception {
+ executeString("DROP TABLE IF EXISTS timezoned");
+ executeString("DROP TABLE IF EXISTS timezoned_orc PURGE");
+ }
+
+ @Test
+ public void testTimezone1() throws Exception {
+ executeString("SET TIME ZONE 'GMT+9'");
+ ResultSet res = executeQuery();
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone2() throws Exception {
+ executeString("SET TIME ZONE 'GMT+1'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone3() throws Exception {
+ executeString("SET TIME ZONE 'GMT'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ cleanupQuery(res);
+ }
+
+ @Test
+ public void testTimezone4() throws Exception {
+ executeString("\\set TIMEZONE 'GMT-5'");
+ ResultSet res = executeString("select * from timezoned_orc");
+ assertResultSet(res);
+ executeString("SET TIME ZONE 'GMT'");
+ cleanupQuery(res);
+ }
+}
diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl b/tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl
similarity index 100%
rename from tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl
rename to tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl
diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql
new file mode 100644
index 0000000000..9c5d30d22c
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql
@@ -0,0 +1,5 @@
+CREATE EXTERNAL TABLE ${0} (
+ t_timestamp TIMESTAMP,
+ t_time TIME,
+ t_date DATE
+) USING TEXT WITH ('timezone' = 'GMT+9') LOCATION ${table.path}
diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql
similarity index 100%
rename from tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql
rename to tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql
diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql
new file mode 100644
index 0000000000..2464c974ae
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql
@@ -0,0 +1 @@
+SELECT * FROM timezoned_orc;
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql
deleted file mode 100644
index 1d898bd73c..0000000000
--- a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-SET SESSION TIMEZONE = 'GMT+9';
-SELECT * FROM timezoned_orc;
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result
similarity index 100%
rename from tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result
rename to tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result
new file mode 100644
index 0000000000..c0e5ceffe1
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 17:50:30.01,1980-04-01
+1980-03-31 17:50:30,1980-04-01
+1980-03-31 17:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result
new file mode 100644
index 0000000000..916f4be8dd
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 16:50:30.01,1980-04-01
+1980-03-31 16:50:30,1980-04-01
+1980-03-31 16:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result
new file mode 100644
index 0000000000..98e0918610
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result
@@ -0,0 +1,5 @@
+t_timestamp,t_date
+-------------------------------
+1980-03-31 11:50:30.01,1980-04-01
+1980-03-31 11:50:30,1980-04-01
+1980-03-31 11:50:30,1980-04-01
\ No newline at end of file
diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml
index 095f128809..a91c431a60 100644
--- a/tajo-dist/pom.xml
+++ b/tajo-dist/pom.xml
@@ -162,13 +162,8 @@
run mkdir -p extlib
- if [ -f $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar ]
- then
- run cp -r $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar lib/
- echo
- echo "Tajo installed parquet-hive-bundle library at: ${project.build.directory}/tajo-${project.version}"
- echo
- fi
+ run mkdir -p lib
+ run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/
echo
echo "Tajo dist layout available at: ${project.build.directory}/tajo-${project.version}"
diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml
index 16e1eb074f..27fa66be32 100644
--- a/tajo-project/pom.xml
+++ b/tajo-project/pom.xml
@@ -40,6 +40,7 @@
4.0.34.Final2.66.1.26
+ 1.8.1${project.parent.relativePath}/..src/main/hadoop-${hadoop.version}
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index 8688b29279..a3a46fe416 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -34,7 +34,6 @@
UTF-8UTF-8
- 1.8.1
@@ -160,6 +159,26 @@
org.apache.maven.pluginsmaven-surefire-report-plugin
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy-dependencies
+ package
+
+ copy-dependencies
+
+
+ runtime
+ ${project.build.directory}/lib
+ false
+ false
+ true
+
+
+
+
@@ -348,6 +367,11 @@
hive-orc${hive.version}
+
+ org.apache.hive
+ hive-storage-api
+ ${hive.version}
+ org.apache.hivehive-serde
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
index 86fe7ad2de..5b159919de 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java
@@ -26,17 +26,14 @@
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.io.Text;
import org.apache.orc.*;
import org.apache.orc.Reader.Options;
import org.apache.orc.impl.BufferChunk;
import org.apache.orc.impl.InStream;
-import org.apache.tajo.SessionVars;
import org.apache.tajo.TajoConstants;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
-import org.apache.tajo.conf.TajoConf.ConfVars;
import org.apache.tajo.plan.expr.EvalNode;
import org.apache.tajo.storage.FileScanner;
import org.apache.tajo.storage.StorageConstants;
@@ -360,7 +357,7 @@ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
bb.position(footerAbsPos);
bb.limit(footerAbsPos + footerSize);
return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
- Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
+ Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
}
private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
@@ -368,7 +365,7 @@ private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsP
bb.position(metadataAbsPos);
bb.limit(metadataAbsPos + metadataSize);
return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
- Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
+ Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
}
/**
From 358b9159e4c40349e7a76e873529e9d064eba171 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Sun, 20 Mar 2016 22:16:56 +0900
Subject: [PATCH 07/16] Fix test failure and clean up hive catalog dependency.
---
.../tajo-catalog-drivers/tajo-hive/pom.xml | 192 +++++++++++++-----
.../TestSelectQuery/timezoned/timezoned1.tbl | 3 +
tajo-dist/pom.xml | 6 +-
tajo-storage/tajo-storage-hdfs/pom.xml | 1 +
4 files changed, 147 insertions(+), 55 deletions(-)
create mode 100644 tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl
diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
index a1e0c98b57..d8484613fb 100644
--- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
+++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml
@@ -134,19 +134,35 @@
org.apache.hadoop
- hadoop-mapreduce-client-core
+ hadoop-common${hadoop.version}provided
+
+
+ zookeeper
+ org.apache.zookeeper
+
+ org.apache.hadoop
- hadoop-common
+ hadoop-mapreduce-client-core${hadoop.version}provided
+
+
+ hadoop-yarn-common
+ org.apache.hadoop
+
+
+ netty
+ io.netty
+
+ org.apache.hive
- hive-exec
+ hive-metastore${hive.version}provided
@@ -156,123 +172,195 @@
org.apache.hive
- hive-contrib
+ hive-serdeorg.apache.hive
- hive-hbase-handler
+ hive-shimss
- org.apache.hive
- hive-metastore
+ org.apache.thrift
+ libfb303
- org.apache.hive
- hive-serde
+ org.apache.thrift
+ libthrift
- org.apache.hive
- hive-shims
+ com.jolbox
+ bonecp
- org.apache.hive
- hive-testutils
+ tephra-hbase-compat-1.0
+ co.cask.tephra
- org.apache.thrift
- libfb303
+ tephra-core
+ co.cask.tephra
- org.apache.thrift
- libthrift
+ tephra-api
+ co.cask.tephra
- com.jolbox
- bonecp
+ hbase-client
+ org.apache.hbase
- com.google.protobuf
- protobuf-java
+ hadoop-yarn-server-resourcemanager
+ org.apache.hadoop
- org.apache.calcite
- calcite-core
+ antlr-runtime
+ org.antlr
- org.apache.calcite
- calcite-avatica
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ zookeeper
+ org.apache.zookeeperorg.apache.hive
- hive-metastore
+ hive-common${hive.version}provided
- org.apache.hive
- hive-common
+ jetty-all
+ org.eclipse.jetty.aggregate
- org.apache.hive
- hive-serde
+ javax.servlet
+ org.eclipse.jetty.orbit
- org.apache.hive
- hive-shimss
+ joda-time
+ joda-time
- org.apache.thrift
- libfb303
+ jackson-databind
+ com.fasterxml.jackson.core
- org.apache.thrift
- libthrift
+ metrics-json
+ io.dropwizard.metrics
- com.jolbox
- bonecp
+ metrics-jvm
+ io.dropwizard.metrics
+
+
+ metrics-core
+ io.dropwizard.metrics
+
+
+ ant
+ org.apache.ant
+
+
+ json
+ org.json
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ log4j-web
+ org.apache.logging.log4j
+
+
+ log4j-1.2-api
+ org.apache.logging.log4jorg.apache.hive
- hive-cli
+ hive-exec${hive.version}provided
+ hive-antorg.apache.hive
- hive-common
+ hive-llap-tezorg.apache.hive
- hive-exec
- org.apache.hive
- hive-metastore
+ ST4
+ org.antlr
- org.apache.hive
- hive-serde
+ ivy
+ org.apache.ivy
- org.apache.hive
- hive-service
+ curator-framework
+ org.apache.curator
- org.apache.hive
- hive-shims
+ apache-curator
+ org.apache.curator
- com.jolbox
- bonecp
+ groovy-all
+ org.codehaus.groovy
+
+
+ calcite-core
+ org.apache.calcite
+
+
+ calcite-avatica
+ org.apache.calcite
+
+
+ stax-api
+ stax
- jlinejline
+ jline
+
+
+ log4j-1.2-api
+ org.apache.logging.log4j
+
+
+ log4j-slf4j-impl
+ org.apache.logging.log4j
+
+
+ ant
+ org.apache.ant
+
+
+ zookeeper
+ org.apache.zookeeper
+
+
+ antlr-runtime
+ org.antlr
+
+
+
+
+ org.apache.hive
+ hive-serde
+ ${hive.version}
+ provided
+
+
+ opencsv
+ net.sf.opencsv
diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl
new file mode 100644
index 0000000000..74b2e1b273
--- /dev/null
+++ b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl
@@ -0,0 +1,3 @@
+1980-4-1 01:50:30.010|01:50:30.010|1980-04-01
+80/4/1 1:50:30 AM|1:50:30 AM|80/4/1
+1980 April 1 1:50:30|1:50:30|1980-04-01
\ No newline at end of file
diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml
index a91c431a60..b742f5eccf 100644
--- a/tajo-dist/pom.xml
+++ b/tajo-dist/pom.xml
@@ -154,6 +154,9 @@
run cp -r ${project.basedir}/src/main/conf .
run rm -rf lib/tajo-*-${project.version}.jar
+ run mkdir -p lib
+ run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/
+
run mkdir hive
run mv lib/hive-*.jar hive/
@@ -162,9 +165,6 @@
run mkdir -p extlib
- run mkdir -p lib
- run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/
-
echo
echo "Tajo dist layout available at: ${project.build.directory}/tajo-${project.version}"
echo
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index a3a46fe416..2a0396a529 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -376,6 +376,7 @@
org.apache.hivehive-serde${hive.version}
+ providedlog4j-slf4j-impl
From a79fa2920de301c1e81bc7c5568f4e1c35650429 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Mon, 21 Mar 2016 09:42:37 +0900
Subject: [PATCH 08/16] Fix test failure
---
tajo-storage/tajo-storage-hdfs/pom.xml | 1 -
1 file changed, 1 deletion(-)
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index 2a0396a529..a3a46fe416 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -376,7 +376,6 @@
org.apache.hivehive-serde${hive.version}
- providedlog4j-slf4j-impl
From b67427fc277dc5dde3498bf4b2856a8260afc069 Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Mon, 21 Mar 2016 15:07:06 +0900
Subject: [PATCH 09/16] Remove hive shims dependency
---
tajo-storage/tajo-storage-hdfs/pom.xml | 70 +++----
.../apache/tajo/storage/orc/ORCAppender.java | 6 -
.../ObjectInspectorFactory.java | 91 ---------
.../TajoBlobObjectInspector.java | 82 --------
.../TajoBooleanObjectInspector.java | 76 --------
.../TajoDateObjectInspector.java | 73 --------
.../TajoDoubleObjectInspector.java | 76 --------
.../TajoFloatObjectInspector.java | 76 --------
.../TajoIntObjectInspector.java | 76 --------
.../TajoLongObjectInspector.java | 76 --------
.../TajoNullObjectInspector.java | 69 -------
.../TajoShortObjectInspector.java | 76 --------
.../TajoStringObjectInspector.java | 71 -------
.../TajoStructObjectInspector.java | 122 ------------
.../TajoTimestampObjectInspector.java | 73 --------
.../orc/ByteBufferAllocatorPool.java | 102 ++++++++++
.../orc/ByteBufferPoolAdapter.java} | 25 +--
.../tajo/storage/thirdparty/orc/OrcUtils.java | 175 ------------------
.../thirdparty/orc/RecordReaderUtils.java | 96 +---------
.../thirdparty/orc/ZeroCopyAdapter.java | 57 ++++++
20 files changed, 213 insertions(+), 1355 deletions(-)
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java
delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferAllocatorPool.java
rename tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/{orc/objectinspector/TajoPrimitiveObjectInspector.java => thirdparty/orc/ByteBufferPoolAdapter.java} (62%)
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java
diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml
index a3a46fe416..f940f5b681 100644
--- a/tajo-storage/tajo-storage-hdfs/pom.xml
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml
@@ -372,41 +372,41 @@
hive-storage-api${hive.version}
-
- org.apache.hive
- hive-serde
- ${hive.version}
-
-
- log4j-slf4j-impl
- org.apache.logging.log4j
-
-
- log4j-1.2-api
- org.apache.logging.log4j
-
-
- hive-common
- org.apache.hive
-
-
- libthrift
- org.apache.thrift
-
-
- opencsv
- net.sf.opencsv
-
-
- hadoop-yarn-server-resourcemanager
- org.apache.hadoop
-
-
- hive-shims-scheduler
- org.apache.hive.shims
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
index b283b2219b..ebdfa3224e 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java
@@ -20,7 +20,6 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
@@ -152,7 +151,6 @@ private static CompressionKind getCompressionKind(TableMeta meta) {
*/
public static class WriterOptions extends OrcFile.WriterOptions {
private boolean explicitSchema = false;
- private ObjectInspector inspector = null;
// Setting the default batch size to 1000 makes the memory check at 5000
// rows work the same as the row by row writer. (If it was the default 1024,
// the smallest stripe size would be 5120 rows, which changes the output
@@ -179,10 +177,6 @@ protected WriterOptions batchSize(int maxSize) {
return this;
}
- ObjectInspector getInspector() {
- return inspector;
- }
-
int getBatchSize() {
return batchSize;
}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
deleted file mode 100644
index 4855ff9fe3..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.common.TajoDataTypes;
-import org.apache.tajo.exception.UnsupportedException;
-
-public class ObjectInspectorFactory {
-
- public static StructObjectInspector buildStructObjectInspector(Schema schema) {
- StructObjectInspector structOI = new TajoStructObjectInspector(schema);
- return structOI;
- }
-
- public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type dataType) throws UnsupportedException {
- ObjectInspector oi = null;
-
- switch(dataType) {
- case BOOLEAN:
- oi = new TajoBooleanObjectInspector();
- break;
-
- case INT2:
- oi = new TajoShortObjectInspector();
- break;
-
- case INET4:
- case INT4:
- oi = new TajoIntObjectInspector();
- break;
-
- case INT8:
- oi = new TajoLongObjectInspector();
- break;
-
- case FLOAT4:
- oi = new TajoFloatObjectInspector();
- break;
-
- case FLOAT8:
- oi = new TajoDoubleObjectInspector();
- break;
-
- case TEXT:
- case CHAR:
- oi = new TajoStringObjectInspector();
- break;
-
- case TIMESTAMP:
- oi = new TajoTimestampObjectInspector();
- break;
-
- case DATE:
- oi = new TajoDateObjectInspector();
- break;
-
- case BLOB:
- case PROTOBUF:
- oi = new TajoBlobObjectInspector();
- break;
-
- case NULL_TYPE:
- oi = new TajoNullObjectInspector();
- break;
-
- default:
- throw new UnsupportedException(dataType.name()+" is not supported yet in ORCAppender");
- }
-
- return oi;
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java
deleted file mode 100644
index d241f84371..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.tajo.datum.Datum;
-
-public class TajoBlobObjectInspector extends TajoPrimitiveObjectInspector implements BinaryObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.binaryTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.BINARY;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public BytesWritable getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return byte [].class;
- }
-
- @Override
- public byte[] getPrimitiveJavaObject(Object o) {
- return ((Datum)o).asByteArray();
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public int precision() {
- return 0;
- }
-
- @Override
- public int scale() {
- return 0;
- }
-
- @Override
- public String getTypeName() {
- return "BINARY";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java
deleted file mode 100644
index 273505f0cb..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Datum;
-
-public class TajoBooleanObjectInspector extends TajoPrimitiveObjectInspector implements BooleanObjectInspector {
- @Override
- public boolean get(Object o) {
- return ((Datum)o).asBool();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.booleanTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.BOOLEAN;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Boolean.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "BOOLEAN";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java
deleted file mode 100644
index f12706b8df..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-
-import java.sql.Date;
-
-public class TajoDateObjectInspector extends TajoPrimitiveObjectInspector implements DateObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.dateTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.DATE;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public DateWritable getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return null;
- }
-
- @Override
- public Date getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "DATE";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java
deleted file mode 100644
index 6dc1f8c95c..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Float8Datum;
-
-public class TajoDoubleObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector {
- @Override
- public double get(Object o) {
- return ((Float8Datum)o).asFloat8();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.doubleTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.DOUBLE;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Double.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "DOUBLE";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java
deleted file mode 100644
index bed8784fb5..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Float4Datum;
-
-public class TajoFloatObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector {
- @Override
- public double get(Object o) {
- return ((Float4Datum)o).asFloat4();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.floatTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.FLOAT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Float.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "FLOAT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java
deleted file mode 100644
index a0c2209678..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int4Datum;
-
-public class TajoIntObjectInspector extends TajoPrimitiveObjectInspector implements IntObjectInspector {
- @Override
- public int get(Object o) {
- return ((Int4Datum)o).asInt4();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.intTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.INT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Integer.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "INT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java
deleted file mode 100644
index b30b3338f6..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int8Datum;
-
-public class TajoLongObjectInspector extends TajoPrimitiveObjectInspector implements LongObjectInspector {
- @Override
- public long get(Object o) {
- return ((Int8Datum)o).asInt8();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.shortTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.LONG;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Long.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "LONG";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java
deleted file mode 100644
index 49998ce30e..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-
-public class TajoNullObjectInspector extends TajoPrimitiveObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.voidTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.VOID;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Void.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "NULL";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java
deleted file mode 100644
index d32bee172a..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.tajo.datum.Int2Datum;
-
-public class TajoShortObjectInspector extends TajoPrimitiveObjectInspector implements ShortObjectInspector {
- @Override
- public short get(Object o) {
- return ((Int2Datum)o).asInt2();
- }
-
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.shortTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.SHORT;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Object getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return Short.class;
- }
-
- @Override
- public Object getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "SHORT";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java
deleted file mode 100644
index b9331da6cd..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.hadoop.io.Text;
-
-public class TajoStringObjectInspector extends TajoPrimitiveObjectInspector implements StringObjectInspector {
- @Override
- public PrimitiveTypeInfo getTypeInfo() {
- return TypeInfoFactory.stringTypeInfo;
- }
-
- @Override
- public PrimitiveCategory getPrimitiveCategory() {
- return PrimitiveCategory.STRING;
- }
-
- @Override
- public Class> getPrimitiveWritableClass() {
- return null;
- }
-
- @Override
- public Text getPrimitiveWritableObject(Object o) {
- return null;
- }
-
- @Override
- public Class> getJavaPrimitiveClass() {
- return null;
- }
-
- @Override
- public String getPrimitiveJavaObject(Object o) {
- return null;
- }
-
- @Override
- public Object copyObject(Object o) {
- return null;
- }
-
- @Override
- public boolean preferWritable() {
- return false;
- }
-
- @Override
- public String getTypeName() {
- return "STRING";
- }
-}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java
deleted file mode 100644
index 7521fa32c6..0000000000
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tajo.storage.orc.objectinspector;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.tajo.catalog.Column;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.exception.UnsupportedException;
-
-import java.util.ArrayList;
-import java.util.List;
-
-public class TajoStructObjectInspector extends StructObjectInspector {
- private final static Log LOG = LogFactory.getLog(TajoStructObjectInspector.class);
- private List<TajoStructField> structFields;
-
- static class TajoStructField implements StructField {
- private String name;
- private ObjectInspector oi;
- private String comment;
-
- TajoStructField(String name, ObjectInspector oi) {
- this(name, oi, null);
- }
-
- TajoStructField(String name, ObjectInspector oi, String comment) {
- this.name = name;
- this.oi = oi;
- this.comment = comment;
- }
-
- @Override
- public String getFieldName() {
- return name;
- }
-
- @Override
- public ObjectInspector getFieldObjectInspector() {
- return oi;
- }
-
- @Override
- public int getFieldID() {
- return 0;
- }
-
- @Override
- public String getFieldComment() {
- return comment;
- }
- }
-
- TajoStructObjectInspector(Schema schema) {
- structFields = new ArrayList<>(schema.size());
-
- for (Column c: schema.getRootColumns()) {
- try {
- TajoStructField field = new TajoStructField(c.getSimpleName(),
- ObjectInspectorFactory.buildObjectInspectorByType(c.getDataType().getType()));
- structFields.add(field);
- } catch (UnsupportedException e) {
- LOG.error(e.getMessage());
- }
- }
- }
-
- @Override
- public List<? extends StructField> getAllStructFieldRefs() {
- return structFields;
- }
-
- @Override
- public StructField getStructFieldRef(String s) {
- for (TajoStructField field:structFields) {
- if (field.getFieldName().equals(s)) {
- return field;
- }
- }
-
- return null;
- }
-
- @Override
- public Object getStructFieldData(Object o, StructField structField) {
- return null;
- }
-
- @Override
- public List