From 8b6f8c27285bffaf49bc3f86d57720e1f68aceda Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 18 Mar 2016 18:12:19 +0900 Subject: [PATCH 01/16] TAJO-2102 --- .../tajo/catalog/store/HiveCatalogStore.java | 1 + .../catalog/store/TestHiveCatalogStore.java | 1 + .../org/apache/tajo/cli/tools/TajoDump.java | 2 +- .../org/apache/tajo/datum/TimestampDatum.java | 2 +- .../java/org/apache/tajo/unit/TimeUnit.java | 2 + tajo-project/pom.xml | 2 +- tajo-storage/tajo-storage-hdfs/pom.xml | 52 + .../apache/tajo/storage/orc/ORCScanner.java | 332 ---- .../apache/tajo/storage/orc/OrcScanner.java | 397 +++++ .../ObjectInspectorFactory.java | 2 +- .../thirdparty/orc/MetadataReader.java | 128 ++ .../tajo/storage/thirdparty/orc/OrcFile.java | 2 +- .../thirdparty/orc/OrcRecordReader.java | 460 +++++ .../tajo/storage/thirdparty/orc/OrcUtils.java | 35 + .../thirdparty/orc/RecordReaderUtils.java | 479 +++++ .../thirdparty/orc/TreeReaderFactory.java | 1576 +++++++++++++++++ .../storage/thirdparty/orc/WriterImpl.java | 6 +- .../tajo/storage/TestCompressionStorages.java | 13 +- .../org/apache/tajo/storage/TestStorages.java | 69 +- .../src/test/resources/storage-default.xml | 2 +- 20 files changed, 3196 insertions(+), 367 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java diff --git 
a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java index 63f18b6f75..1d0d261d12 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java @@ -44,6 +44,7 @@ import org.apache.tajo.algebra.IsNullPredicate; import org.apache.tajo.algebra.JsonHelper; import org.apache.tajo.catalog.*; +import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.partition.PartitionMethodDesc; import org.apache.tajo.catalog.proto.CatalogProtos; import org.apache.tajo.catalog.proto.CatalogProtos.*; diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java index 7e1a3a4ff6..6bb66a1a46 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java @@ -78,6 +78,7 @@ public static void setUp() throws Exception { conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString()); conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri); conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, warehousePath.toUri().toString()); + conf.setBoolean("datanucleus.schema.autoCreateAll", true); // TODO: check this is valid // create local HiveCatalogStore. 
TajoConf tajoConf = new TajoConf(conf); diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java index 4df418f5be..c9fa2b488c 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoDump.java @@ -208,7 +208,7 @@ private static void dumpDatabase(TajoClient client, String databaseName, PrintWr } } writer.write("\n\n"); - } catch (Exception e) { + } catch (Throwable e) { // dump for each table can throw any exception. We need to skip the exception case. // here, the error message prints out via stderr. System.err.println("ERROR:" + tableName + "," + e.getMessage()); diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java index 5b4c152a51..f69e7da2f4 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java @@ -125,7 +125,7 @@ public String toString() { /** * - * @param tm TimeMEta + * @param tm TimeMeta * @param timeZone Timezone * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ * @return A timestamp string diff --git a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java index 8062f2de5a..a03a930d78 100644 --- a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java +++ b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java @@ -26,4 +26,6 @@ public class TimeUnit { public static final int DAY = HOUR * 24; public static final int PART_UNIT = 5*TimeUnit.MIN; + + public static final int MILLIS_PER_SECOND = 1000; } diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index cd86d3b350..16e1eb074f 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -36,7 +36,7 @@ 2.7.2 2.5.0 1.1.1 - 1.1.0 + 2.0.0 4.0.34.Final 2.6 6.1.26 diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 5f66395e94..2c4538a6ba 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -349,6 +349,58 @@ presto-orc 0.141 + + org.apache.hive + hive-orc + ${hive.version} + + + org.apache.hive + hive-serde + ${hive.version} + + + log4j-slf4j-impl + org.apache.logging.log4j + + + log4j-1.2-api + org.apache.logging.log4j + + + + + org.apache.hive + hive-exec + ${hive.version} + + + log4j-1.2-api + org.apache.logging.log4j + + + log4j-slf4j-impl + org.apache.logging.log4j + + + antlr-runtime + org.antlr + + + jline + jline + + + calcite-core + org.apache.calcite + + + calcite-avatica + org.apache.calcite + + + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java deleted file mode 100644 index 0a4ebc6948..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ /dev/null @@ -1,332 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc; - -import com.facebook.presto.orc.OrcDataSource; -import com.facebook.presto.orc.OrcPredicate; -import com.facebook.presto.orc.OrcReader; -import com.facebook.presto.orc.OrcRecordReader; -import com.facebook.presto.orc.memory.AggregatedMemoryContext; -import com.facebook.presto.orc.metadata.OrcMetadataReader; -import com.facebook.presto.spi.block.Block; -import com.facebook.presto.spi.type.*; -import com.google.protobuf.InvalidProtocolBufferException; -import io.airlift.units.DataSize; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.tajo.TajoConstants; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.datum.*; -import org.apache.tajo.exception.NotImplementedException; -import org.apache.tajo.exception.TajoRuntimeException; -import org.apache.tajo.plan.expr.EvalNode; -import org.apache.tajo.storage.FileScanner; -import 
org.apache.tajo.storage.StorageConstants; -import org.apache.tajo.storage.Tuple; -import org.apache.tajo.storage.VTuple; -import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource; -import org.apache.tajo.util.datetime.DateTimeUtil; -import org.joda.time.DateTimeZone; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.TimeZone; - -/** - * OrcScanner for reading ORC files - */ -public class ORCScanner extends FileScanner { - private static final Log LOG = LogFactory.getLog(ORCScanner.class); - private OrcRecordReader recordReader; - private Block[] blocks; - private int currentPosInBatch = 0; - private int batchSize = 0; - private Tuple outTuple; - private AggregatedMemoryContext aggrMemoryContext = new AggregatedMemoryContext(); - - public ORCScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { - super(conf, schema, meta, fragment); - } - - private FileSystem fs; - private FSDataInputStream fis; - - private static class ColumnInfo { - TajoDataTypes.DataType type; - int id; - } - - /** - * Temporary array for caching column info - */ - private ColumnInfo [] targetColInfo; - - @Override - public void init() throws IOException { - OrcReader orcReader; - DataSize maxMergeDistance = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_MERGE_DISTANCE, - StorageConstants.DEFAULT_ORC_MAX_MERGE_DISTANCE)), DataSize.Unit.BYTE); - DataSize maxReadSize = new DataSize(Double.parseDouble(meta.getProperty(StorageConstants.ORC_MAX_READ_BUFFER_SIZE, - StorageConstants.DEFAULT_ORC_MAX_READ_BUFFER_SIZE)), DataSize.Unit.BYTE); - - if (targets == null) { - targets = schema.toArray(); - } - - outTuple = new VTuple(targets.length); - - Path path = fragment.getPath(); - - if(fs == null) { - fs = FileScanner.getFileSystem((TajoConf)conf, path); - } - - if(fis == null) { - fis = fs.open(path); - } - - OrcDataSource 
orcDataSource = new HdfsOrcDataSource( - this.fragment.getPath().toString(), - fis, - fs.getFileStatus(path).getLen(), - maxMergeDistance, - maxReadSize); - - targetColInfo = new ColumnInfo[targets.length]; - for (int i=0; i columnMap = new HashMap<>(); - for (ColumnInfo colInfo: targetColInfo) { - columnMap.put(colInfo.id, createFBtypeByTajoType(colInfo.type)); - } - - orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize); - - TimeZone timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, - TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); - - // TODO: make OrcPredicate useful - // presto-orc uses joda timezone, so it needs to be converted. - recordReader = orcReader.createRecordReader(columnMap, OrcPredicate.TRUE, - fragment.getStartKey(), fragment.getLength(), DateTimeZone.forTimeZone(timezone), aggrMemoryContext); - - super.init(); - LOG.debug("file fragment { path: " + fragment.getPath() + - ", start offset: " + fragment.getStartKey() + - ", length: " + fragment.getLength() + "}"); - } - - @Override - public Tuple next() throws IOException { - if (currentPosInBatch == batchSize) { - getNextBatch(); - - // EOF - if (batchSize == -1) { - return null; - } - } - - for (int i=0; i stripeStats; + private int metadataSize; + protected List types; + private List userMetadata; + private List fileStats; + private List stripes; + protected int rowIndexStride; + private long contentLength, numberOfRows; + + private List versionList; + + //serialized footer - Keeping this around for use by getFileMetaInfo() + // will help avoid cpu cycles spend in deserializing at cost of increased + // memory footprint. + private ByteBuffer footerByteBuffer; + // Same for metastore cache - maintains the same background buffer, but includes postscript. + // This will only be set if the file footer/metadata was read from disk. 
+ private ByteBuffer footerMetaAndPsBuffer; + + private OrcRecordReader recordReader; + + private long recordCount = 0; + + /** + * Ensure this is an ORC file to prevent users from trying to read text + * files or RC files as ORC files. + * @param in the file being read + * @param path the filename for error messages + * @param psLen the postscript length + * @param buffer the tail of the file + * @throws IOException + */ + static void ensureOrcFooter(FSDataInputStream in, + Path path, + int psLen, + ByteBuffer buffer) throws IOException { + int len = OrcFile.MAGIC.length(); + if (psLen < len + 1) { + throw new IOException("Malformed ORC file " + path + + ". Invalid postscript length " + psLen); + } + int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1 - len; + byte[] array = buffer.array(); + // now look for the magic string at the end of the postscript. + if (!Text.decode(array, offset, len).equals(OrcFile.MAGIC)) { + // If it isn't there, this may be the 0.11.0 version of ORC. + // Read the first 3 bytes of the file to check for the header + byte[] header = new byte[len]; + in.readFully(0, header, 0, len); + // if it isn't there, this isn't an ORC file + if (!Text.decode(header, 0 , len).equals(OrcFile.MAGIC)) { + throw new IOException("Malformed ORC file " + path + + ". Invalid postscript."); + } + } + } + + /** + * Build a version string out of an array. + * @param version the version number as a list + * @return the human readable form of the version string + */ + private static String versionString(List version) { + StringBuilder buffer = new StringBuilder(); + for(int i=0; i < version.size(); ++i) { + if (i != 0) { + buffer.append('.'); + } + buffer.append(version.get(i)); + } + return buffer.toString(); + } + + /** + * Check to see if this ORC file is from a future version and if so, + * warn the user that we may not be able to read all of the column encodings. 
+ * @param log the logger to write any error message to + * @param path the data source path for error messages + * @param version the version of hive that wrote the file. + */ + static void checkOrcVersion(Log log, Path path, List version) { + if (version.size() >= 1) { + int major = version.get(0); + int minor = 0; + if (version.size() >= 2) { + minor = version.get(1); + } + if (major > OrcFile.Version.CURRENT.getMajor() || + (major == OrcFile.Version.CURRENT.getMajor() && + minor > OrcFile.Version.CURRENT.getMinor())) { + log.warn(path + " was written by a future Hive version " + + versionString(version) + + ". This file may not be readable by this version of Hive."); + } + } + } + + public OrcScanner(Configuration conf, Schema schema, TableMeta meta, Fragment fragment) throws IOException { + super(conf, schema, meta, fragment); + + this.path = this.fragment.getPath(); + this.fileSystem = this.path.getFileSystem(conf); + } + + private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, + Path path, + long maxFileLength + ) throws IOException { + FSDataInputStream file = fs.open(path); + + // figure out the size of the file using the option or filesystem + long size; + if (maxFileLength == Long.MAX_VALUE) { + size = fs.getFileStatus(path).getLen(); + } else { + size = maxFileLength; + } + + //read last bytes into buffer to get PostScript + int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS); + ByteBuffer buffer = ByteBuffer.allocate(readSize); + assert buffer.position() == 0; + file.readFully((size - readSize), + buffer.array(), buffer.arrayOffset(), readSize); + buffer.position(0); + + //read the PostScript + //get length of PostScript + int psLen = buffer.get(readSize - 1) & 0xff; + ensureOrcFooter(file, path, psLen, buffer); + int psOffset = readSize - 1 - psLen; + OrcProto.PostScript ps = extractPostScript(buffer, path, psLen, psOffset); + + int footerSize = (int) ps.getFooterLength(); + int metadataSize = (int) ps.getMetadataLength(); + + 
//check if extra bytes need to be read + ByteBuffer fullFooterBuffer = null; + int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize); + if (extra > 0) { + //more bytes need to be read, seek back to the right place and read extra bytes + ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize); + file.readFully((size - readSize - extra), extraBuf.array(), + extraBuf.arrayOffset() + extraBuf.position(), extra); + extraBuf.position(extra); + //append with already read bytes + extraBuf.put(buffer); + buffer = extraBuf; + buffer.position(0); + fullFooterBuffer = buffer.slice(); + buffer.limit(footerSize + metadataSize); + } else { + //footer is already in the bytes in buffer, just adjust position, length + buffer.position(psOffset - footerSize - metadataSize); + fullFooterBuffer = buffer.slice(); + buffer.limit(psOffset); + } + + // remember position for later + buffer.mark(); + + file.close(); + + return new FileMetaInfo( + ps.getCompression().toString(), + (int) ps.getCompressionBlockSize(), + (int) ps.getMetadataLength(), + buffer, + ps.getVersionList(), + org.apache.orc.OrcFile.WriterVersion.FUTURE, + fullFooterBuffer + ); + } + + public OrcRecordReader getRecordReader() throws IOException { + boolean skipCorruptRecords = conf.getBoolean("orc.skip.corrupt-records", false); + + return new OrcRecordReader(meta, this.stripes, fileSystem, schema, targets, fragment, + skipCorruptRecords, types, codec, bufferSize, rowIndexStride, conf); + } + + @Override + public void init() throws IOException { + super.init(); + + FileMetaInfo footerMetaData = extractMetaInfoFromFooter(fileSystem, path, maxLength); + this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; + MetaInfoObjExtractor rInfo = + new MetaInfoObjExtractor(footerMetaData.compressionType, + footerMetaData.bufferSize, + footerMetaData.metadataSize, + footerMetaData.footerBuffer + ); + this.footerByteBuffer = footerMetaData.footerBuffer; + this.compressionKind = rInfo.compressionKind; 
+ this.codec = rInfo.codec; + this.bufferSize = rInfo.bufferSize; + this.metadataSize = rInfo.metadataSize; + this.stripeStats = rInfo.metadata.getStripeStatsList(); + this.types = rInfo.footer.getTypesList(); + this.rowIndexStride = rInfo.footer.getRowIndexStride(); + this.contentLength = rInfo.footer.getContentLength(); + this.numberOfRows = rInfo.footer.getNumberOfRows(); + this.userMetadata = rInfo.footer.getMetadataList(); + this.fileStats = rInfo.footer.getStatisticsList(); + this.versionList = footerMetaData.versionList; + this.stripes = rInfo.footer.getStripesList(); + + recordReader = getRecordReader(); + } + + @Override + public Tuple next() throws IOException { + Tuple next = recordReader.next(); + if (next != null) { + recordCount++; + } + return next; + } + + @Override + public void reset() throws IOException { + // TODO: improve this + this.close(); + recordReader = getRecordReader(); + } + + @Override + public void close() throws IOException { + if (recordReader != null) { + recordReader.close(); + tableStats.setNumBytes(recordReader.getNumBytes()); + tableStats.setNumRows(recordCount); + } + } + + @Override + public boolean isProjectable() { + return true; + } + + @Override + public boolean isSelectable() { + return false; + } + + @Override + public void setFilter(EvalNode filter) { + // TODO: implement this + } + + @Override + public float getProgress() { + return inited ? recordReader.getProgress() : super.getProgress(); + } + + @Override + public boolean isSplittable() { + return true; + } + + private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path, + int psLen, int psAbsOffset) throws IOException { + // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here. 
+ assert bb.hasArray(); + CodedInputStream in = CodedInputStream.newInstance( + bb.array(), bb.arrayOffset() + psAbsOffset, psLen); + OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in); + checkOrcVersion(LOG, path, ps.getVersionList()); + + // Check compression codec. + switch (ps.getCompression()) { + case NONE: + break; + case ZLIB: + break; + case SNAPPY: + break; + case LZO: + break; + default: + throw new IllegalArgumentException("Unknown compression"); + } + return ps; + } + + private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos, + int footerSize, CompressionCodec codec, int bufferSize) throws IOException { + bb.position(footerAbsPos); + bb.limit(footerAbsPos + footerSize); + return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer", + Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); + } + + private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, + int metadataSize, CompressionCodec codec, int bufferSize) throws IOException { + bb.position(metadataAbsPos); + bb.limit(metadataAbsPos + metadataSize); + return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata", + Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize)); + } + + /** + * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl + * from serialized fields. + * As the fields are final, the fields need to be initialized in the constructor and + * can't be done in some helper function. So this helper class is used instead. 
+ * + */ + private static class MetaInfoObjExtractor{ + final org.apache.orc.CompressionKind compressionKind; + final CompressionCodec codec; + final int bufferSize; + final int metadataSize; + final OrcProto.Metadata metadata; + final OrcProto.Footer footer; + + MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, + ByteBuffer footerBuffer) throws IOException { + + this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr); + this.bufferSize = bufferSize; + this.codec = OrcUtils.createCodec(compressionKind); + this.metadataSize = metadataSize; + + int position = footerBuffer.position(); + int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize; + + this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize); + this.footer = extractFooter( + footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize); + + footerBuffer.position(position); + } + } + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java index 061ba0d034..4855ff9fe3 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java @@ -83,7 +83,7 @@ public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type data break; default: - throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender"); + throw new UnsupportedException(dataType.name()+" is not supported yet in ORCAppender"); } return oi; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java new file mode 100644 index 0000000000..a3685a7240 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.impl.BufferChunk;
+import org.apache.orc.impl.InStream;
+import org.apache.orc.impl.OrcIndex;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+/**
+ * Reads stripe-level metadata of an ORC file: stripe footers, row indexes and bloom filter
+ * indexes. The input stream handed to (or opened by) the constructor is closed by
+ * {@link #close()}.
+ */
+public class MetadataReader implements Closeable {
+
+  private final FSDataInputStream file;
+  private final CompressionCodec codec;
+  private final int bufferSize;
+  private final int typeCount;
+
+  /**
+   * Opens {@code path} on {@code fileSystem} and reads metadata from the resulting stream.
+   *
+   * @param fileSystem file system used to open the file
+   * @param path       file to open
+   * @param codec      codec passed to the stream decoders
+   * @param bufferSize buffer size passed to the stream decoders
+   * @param typeCount  size of the index arrays allocated when the caller supplies none
+   * @throws IOException if the file cannot be opened
+   */
+  public MetadataReader(FileSystem fileSystem, Path path,
+      CompressionCodec codec, int bufferSize, int typeCount) throws IOException {
+    this(fileSystem.open(path), codec, bufferSize, typeCount);
+  }
+
+  /**
+   * Reads metadata from an already opened stream; the stream is closed by {@link #close()}.
+   *
+   * @param file       stream positioned reads are issued against
+   * @param codec      codec passed to the stream decoders
+   * @param bufferSize buffer size passed to the stream decoders
+   * @param typeCount  size of the index arrays allocated when the caller supplies none
+   */
+  public MetadataReader(FSDataInputStream file,
+      CompressionCodec codec, int bufferSize, int typeCount) {
+    this.file = file;
+    this.codec = codec;
+    this.bufferSize = bufferSize;
+    this.typeCount = typeCount;
+  }
+
+  /**
+   * Reads the row indexes (and, for sarg columns, the bloom filter indexes) of a stripe.
+   *
+   * @param stripe             the stripe whose indexes are read
+   * @param footer             the stripe footer, or {@code null} to read it from the file
+   * @param included           per-column flags selecting the indexes to read, or {@code null}
+   *                           to read all of them
+   * @param indexes            array receiving the row indexes; entries that are already
+   *                           non-null are not read again; allocated when {@code null}
+   * @param sargColumns        per-column flags selecting the columns whose bloom filter is read
+   *                           together with the row index, or {@code null} for none
+   * @param bloomFilterIndices array receiving the bloom filter indexes; allocated when
+   *                           {@code null}
+   * @return the row indexes and bloom filter indexes wrapped in an {@link OrcIndex}
+   * @throws IOException if reading or decoding a stream fails
+   */
+  public OrcIndex readRowIndex(OrcProto.StripeInformation stripe,
+      OrcProto.StripeFooter footer, boolean[] included, OrcProto.RowIndex[] indexes,
+      boolean[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices) throws IOException {
+    if (footer == null) {
+      footer = readStripeFooter(stripe);
+    }
+    if (indexes == null) {
+      indexes = new OrcProto.RowIndex[typeCount];
+    }
+    if (bloomFilterIndices == null) {
+      bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
+    }
+    long offset = stripe.getOffset();
+    List<OrcProto.Stream> streams = footer.getStreamsList();
+    for (int i = 0; i < streams.size(); i++) {
+      OrcProto.Stream stream = streams.get(i);
+      OrcProto.Stream nextStream = null;
+      if (i < streams.size() - 1) {
+        nextStream = streams.get(i+1);
+      }
+      int col = stream.getColumn();
+      int len = (int) stream.getLength();
+      // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
+      // filter and combine the io to read row index and bloom filters for that column together
+      if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
+        boolean readBloomFilter = false;
+        // nextStream is null when the row index is the last stream of the stripe
+        if (sargColumns != null && sargColumns[col] && nextStream != null &&
+            nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
+          len += nextStream.getLength();
+          i += 1;
+          readBloomFilter = true;
+        }
+        if ((included == null || included[col]) && indexes[col] == null) {
+          byte[] buffer = new byte[len];
+          file.readFully(offset, buffer, 0, buffer.length);
+          ByteBuffer bb = ByteBuffer.wrap(buffer);
+          indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
+              Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
+              codec, bufferSize));
+          if (readBloomFilter) {
+            // the bloom filter bytes directly follow the row index bytes in the same buffer
+            bb.position((int) stream.getLength());
+            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
+                "bloom_filter", Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)),
+                nextStream.getLength(), codec, bufferSize));
+          }
+        }
+      }
+      // advance past this stream (and the bloom filter stream when it was merged into len)
+      offset += len;
+    }
+
+    return new OrcIndex(indexes, bloomFilterIndices);
+  }
+
+  /**
+   * Reads and decodes the footer of a stripe. The footer is located right after the index and
+   * data sections of the stripe.
+   *
+   * @param stripe the stripe whose footer is read
+   * @return the decoded stripe footer
+   * @throws IOException if reading or decoding the footer fails
+   */
+  public OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException {
+    long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
+    int tailLength = (int) stripe.getFooterLength();
+
+    // read the footer
+    ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
+    file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
+    return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
+        Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)),
+        tailLength, codec, bufferSize));
+  }
+
+  /** Closes the underlying input stream. */
+  @Override
+  public void close() throws IOException {
+    file.close();
+  }
+}
diff --git
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java index a291953981..b3d9d30795 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java @@ -52,7 +52,7 @@ public final class OrcFile { */ public static enum Version { V_0_11("0.11", 0, 11), - V_0_12("0.12", 0, 12); + V_0_12("0.12", 0, 12); public static final Version CURRENT = V_0_12; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java new file mode 100644 index 0000000000..18a602bd34 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java @@ -0,0 +1,460 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.tajo.storage.thirdparty.orc;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.DataReader;
+import org.apache.orc.OrcProto;
+import org.apache.orc.impl.*;
+import org.apache.orc.impl.StreamName;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.storage.Tuple;
+import org.apache.tajo.storage.VTuple;
+import org.apache.tajo.storage.fragment.FileFragment;
+import org.apache.tajo.storage.thirdparty.orc.TreeReaderFactory.DatumTreeReader;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Reads the rows of the ORC stripes that start inside a file fragment and returns the target
+ * columns of each row as a {@link Tuple}.
+ *
+ * <p>{@link #next()} fills and returns the same tuple instance on every call, so callers that
+ * keep rows must copy them.
+ */
+public class OrcRecordReader implements Closeable {
+
+  private static final Log LOG = LogFactory.getLog(OrcRecordReader.class);
+
+  private final Path path;
+  private final long firstRow;
+  private final List<OrcProto.StripeInformation> stripes = new ArrayList<>();
+  private OrcProto.StripeFooter stripeFooter;
+  private final long totalRowCount;
+  private final CompressionCodec codec;
+  private final List<OrcProto.Type> types;
+  private final int bufferSize;
+  private final boolean[] included;
+  private final long rowIndexStride;
+  private long rowInStripe = 0;
+  private int currentStripe = -1;
+  private long rowBaseInStripe = 0;
+  private long rowCountInStripe = 0;
+  private final Map<StreamName, InStream> streams = new HashMap<>();
+  DiskRangeList bufferChunks = null;
+  private final TreeReaderFactory.DatumTreeReader[] reader;
+  private final OrcProto.RowIndex[] indexes;
+  private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+  private final Configuration conf;
+  private final org.apache.tajo.storage.thirdparty.orc.MetadataReader metadata;
+  private final DataReader dataReader;
+  private final Tuple result;
+
+  /**
+   * Creates a reader over the stripes whose start offset lies inside {@code fragment} and
+   * positions it on the first row.
+   *
+   * @param meta               table meta passed to the column readers
+   * @param stripes            all stripes of the file
+   * @param fileSystem         file system the file is opened on
+   * @param schema             schema of the file
+   * @param target             columns to read
+   * @param fragment           fragment (path, start offset and length) to read
+   * @param skipCorruptRecords passed to the column readers
+   * @param types              ORC types of the file
+   * @param codec              compression codec of the file, or {@code null} if uncompressed
+   * @param bufferSize         compression buffer size
+   * @param strideRate         number of rows per row index entry; 0 disables index based seeking
+   * @param conf               configuration
+   * @throws IOException if the file cannot be opened or the first stripe cannot be read
+   */
+  public OrcRecordReader(TableMeta meta,
+                         List<OrcProto.StripeInformation> stripes,
+                         FileSystem fileSystem,
+                         Schema schema,
+                         Column[] target,
+                         FileFragment fragment,
+                         boolean skipCorruptRecords,
+                         List<OrcProto.Type> types,
+                         CompressionCodec codec,
+                         int bufferSize,
+                         long strideRate,
+                         Configuration conf
+  ) throws IOException {
+
+    result = new VTuple(target.length);
+
+    this.conf = conf;
+    this.path = fragment.getPath();
+    this.codec = codec;
+    this.types = types;
+    this.bufferSize = bufferSize;
+    this.included = new boolean[schema.size() + 1];
+    included[0] = target.length > 0; // always include root column except when target schema size is 0
+    Schema targetSchema = new Schema(target);
+    for (int i = 1; i < included.length; i++) {
+      included[i] = targetSchema.contains(schema.getColumn(i - 1));
+    }
+    this.rowIndexStride = strideRate;
+    this.metadata = new org.apache.tajo.storage.thirdparty.orc.MetadataReader(fileSystem, path, codec, bufferSize, types.size());
+
+    long rows = 0;
+    long skippedRows = 0;
+    long offset = fragment.getStartKey();
+    long maxOffset = fragment.getStartKey() + fragment.getLength();
+    for(OrcProto.StripeInformation stripe: stripes) {
+      long stripeStart = stripe.getOffset();
+      if (offset > stripeStart) {
+        skippedRows += stripe.getNumberOfRows();
+      } else if (stripeStart < maxOffset) {
+        this.stripes.add(stripe);
+        rows += stripe.getNumberOfRows();
+      }
+    }
+
+    // TODO: we could change the ctor to pass this externally
+    this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, true, codec);
+    this.dataReader.open();
+
+    firstRow = skippedRows;
+    totalRowCount = rows;
+    Boolean skipCorrupt = skipCorruptRecords;
+
+    reader = new DatumTreeReader[target.length];
+    for (int i = 0; i < reader.length; i++) {
+      reader[i] = TreeReaderFactory.createTreeReader(meta, schema.getColumnId(target[i].getQualifiedName()), target[i],
+          skipCorrupt);
+    }
+
+    indexes = new OrcProto.RowIndex[types.size()];
+    bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+    advanceToNextRow(reader, 0L, true);
+  }
+
+  /**
+   * Plan the ranges of the file that we need to read given the list of
+   * columns and row groups.
+   *
+   * @param streamList      the list of streams available
+   * @param includedColumns which columns are needed
+   * @param doMergeBuffers  passed through to {@link RecordReaderUtils#addEntireStreamToRanges}
+   * @return the list of disk ranges that will be loaded
+   */
+  static DiskRangeList planReadPartialDataStreams
+  (List<OrcProto.Stream> streamList,
+   boolean[] includedColumns,
+   boolean doMergeBuffers) {
+    long offset = 0;
+    // figure out which columns have a present stream
+    DiskRangeList.CreateHelper list = new DiskRangeList.CreateHelper();
+    for (OrcProto.Stream stream : streamList) {
+      long length = stream.getLength();
+      int column = stream.getColumn();
+      OrcProto.Stream.Kind streamKind = stream.getKind();
+      // since stream kind is optional, first check if it exists
+      if (stream.hasKind() &&
+          (org.apache.orc.impl.StreamName.getArea(streamKind) == org.apache.orc.impl.StreamName.Area.DATA) &&
+          includedColumns[column]) {
+        RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
+      }
+      offset += length;
+    }
+    return list.extract();
+  }
+
+  /**
+   * Creates an {@link InStream} for every data stream of an included column and registers it
+   * in {@code streams} under its stream name.
+   *
+   * @param streamDescriptions streams listed in the stripe footer, in file order
+   * @param ranges             the buffers that were read for the stripe
+   * @param includeColumn      per-column inclusion flags, or {@code null} to include all columns
+   * @param codec              compression codec used to decode the streams
+   * @param bufferSize         compression buffer size
+   * @param streams            map receiving the created streams
+   * @throws IOException if a stream cannot be created
+   */
+  void createStreams(List<OrcProto.Stream> streamDescriptions,
+                     DiskRangeList ranges,
+                     boolean[] includeColumn,
+                     CompressionCodec codec,
+                     int bufferSize,
+                     Map<StreamName, InStream> streams) throws IOException {
+    long streamOffset = 0;
+    for (OrcProto.Stream streamDesc : streamDescriptions) {
+      int column = streamDesc.getColumn();
+      if ((includeColumn != null && !includeColumn[column]) ||
+          streamDesc.hasKind() &&
+              (org.apache.orc.impl.StreamName.getArea(streamDesc.getKind()) != org.apache.orc.impl.StreamName.Area.DATA)) {
+        streamOffset += streamDesc.getLength();
+        continue;
+      }
+      List<DiskRange> buffers = RecordReaderUtils.getStreamBuffers(
+          ranges, streamOffset, streamDesc.getLength());
+      org.apache.orc.impl.StreamName name = new StreamName(column, streamDesc.getKind());
+      streams.put(name, InStream.create(name.toString(), buffers,
+          streamDesc.getLength(), codec, bufferSize));
+      streamOffset += streamDesc.getLength();
+    }
+  }
+
+  /**
+   * Reads only the data streams of the included columns of a stripe and creates their streams.
+   *
+   * @param stripe the stripe to read
+   * @throws IOException if reading the file fails
+   */
+  private void readPartialDataStreams(OrcProto.StripeInformation stripe) throws IOException {
+    List<OrcProto.Stream> streamList = stripeFooter.getStreamsList();
+    DiskRangeList toRead = planReadPartialDataStreams(streamList, included, true);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
+    }
+    bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
+    }
+
+    createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
+  }
+
+  /**
+   * Skip over rows that we aren't selecting, so that the next row is
+   * one that we will read.
+   *
+   * @param reader           the column readers to reposition
+   * @param nextRow          the row we want to go to
+   * @param canAdvanceStripe whether the next stripe may be read when {@code nextRow} lies
+   *                         beyond the current stripe
+   * @return {@code canAdvanceStripe} when {@code nextRow} lies beyond the current stripe,
+   *         {@code true} otherwise
+   * @throws IOException if seeking or skipping fails
+   */
+  private boolean advanceToNextRow(
+      TreeReaderFactory.TreeReader[] reader, long nextRow, boolean canAdvanceStripe)
+      throws IOException {
+    long nextRowInStripe = nextRow - rowBaseInStripe;
+
+    if (nextRowInStripe >= rowCountInStripe) {
+      if (canAdvanceStripe) {
+        advanceStripe();
+      }
+      return canAdvanceStripe;
+    }
+    if (nextRowInStripe != rowInStripe) {
+      if (rowIndexStride != 0) {
+        // jump to the start of the row group, then skip the remaining rows inside it
+        int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+        seekToRowEntry(reader, rowGroup);
+        for (TreeReaderFactory.TreeReader eachReader : reader) {
+          eachReader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
+        }
+      } else {
+        for (TreeReaderFactory.TreeReader eachReader : reader) {
+          eachReader.skipRows(nextRowInStripe - rowInStripe);
+        }
+      }
+      rowInStripe = nextRowInStripe;
+    }
+    return true;
+  }
+
+  /**
+   * @return {@code true} if the current stripe still has unread rows
+   * @throws IOException declared for callers; not thrown by the current implementation
+   */
+  public boolean hasNext() throws IOException {
+    return rowInStripe < rowCountInStripe;
+  }
+
+  /**
+   * Reads the next row.
+   *
+   * @return the tuple holding the target columns of the row (the same instance on every call),
+   *         or {@code null} when no row is left
+   * @throws IOException if reading fails; the message contains the file path
+   */
+  public Tuple next() throws IOException {
+    if (hasNext()) {
+      try {
+        for (int i = 0; i < reader.length; i++) {
+          result.put(i, reader[i].next());
+        }
+        // find the next row
+        rowInStripe += 1;
+        advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
+        return result;
+      } catch (IOException e) {
+        // Rethrow exception with file name in log message
+        throw new IOException("Error reading file: " + path, e);
+      }
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Read the next stripe until we find a row that we don't skip.
+   *
+   * @throws IOException if reading a stripe fails
+   */
+  private void advanceStripe() throws IOException {
+    rowInStripe = rowCountInStripe;
+    while (rowInStripe >= rowCountInStripe &&
+        currentStripe < stripes.size() - 1) {
+      currentStripe += 1;
+      readStripe();
+    }
+  }
+
+  /**
+   * Read the current stripe into memory.
+   *
+   * @throws IOException if reading the stripe fails
+   */
+  private void readStripe() throws IOException {
+    OrcProto.StripeInformation stripe = beginReadStripe();
+
+    // if we haven't skipped the whole stripe, read the data
+    if (rowInStripe < rowCountInStripe) {
+      // if we aren't projecting columns or filtering rows, just read it all
+      if (included == null) {
+        readAllDataStreams(stripe);
+      } else {
+        readPartialDataStreams(stripe);
+      }
+
+      for (TreeReaderFactory.TreeReader eachReader : reader) {
+        eachReader.startStripe(streams, stripeFooter);
+      }
+      // if we skipped the first row group, move the pointers forward
+      if (rowInStripe != 0) {
+        seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
+      }
+    }
+  }
+
+  /**
+   * Closes all streams of the current stripe and hands their buffers back to the data reader
+   * when it tracks disk ranges.
+   *
+   * @throws IOException declared for callers
+   */
+  private void clearStreams() throws IOException {
+    // explicit close of all streams to de-ref ByteBuffers
+    for (InStream is : streams.values()) {
+      is.close();
+    }
+    if (bufferChunks != null) {
+      if (dataReader.isTrackingDiskRanges()) {
+        for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
+          if (!(range instanceof BufferChunk)) {
+            continue;
+          }
+          dataReader.releaseBuffer(((BufferChunk) range).getChunk());
+        }
+      }
+    }
+    bufferChunks = null;
+    streams.clear();
+  }
+
+  /**
+   * Reads the footer of a stripe through the metadata reader.
+   *
+   * @param stripe the stripe whose footer is read
+   * @return the stripe footer
+   * @throws IOException if reading fails
+   */
+  OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException {
+    return metadata.readStripeFooter(stripe);
+  }
+
+  /**
+   * Loads the footer of the current stripe and resets the per-stripe state (streams, row
+   * counters and cached row indexes).
+   *
+   * @return the current stripe
+   * @throws IOException if the footer cannot be read
+   */
+  private OrcProto.StripeInformation beginReadStripe() throws IOException {
+    OrcProto.StripeInformation stripe = stripes.get(currentStripe);
+    stripeFooter = readStripeFooter(stripe);
+    clearStreams();
+    // setup the position in the stripe
+    rowCountInStripe = stripe.getNumberOfRows();
+    rowInStripe = 0;
+    rowBaseInStripe = 0;
+    for (int i = 0; i < currentStripe; ++i) {
+      rowBaseInStripe += stripes.get(i).getNumberOfRows();
+    }
+    // reset all of the indexes
+    for (int i = 0; i < indexes.length; ++i) {
+      indexes[i] = null;
+    }
+    return stripe;
+  }
+
+  /**
+   * Reads the whole data section of a stripe with a single read and creates the streams.
+   *
+   * @param stripe the stripe to read
+   * @throws IOException if reading the file fails
+   */
+  private void readAllDataStreams(OrcProto.StripeInformation stripe) throws IOException {
+    long start = stripe.getIndexLength();
+    long end = start + stripe.getDataLength();
+    // explicitly trigger 1 big read
+    DiskRangeList toRead = new DiskRangeList(start, end);
+    bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+    List<OrcProto.Stream> streamDescriptions = stripeFooter.getStreamsList();
+    createStreams(streamDescriptions, bufferChunks, included, codec, bufferSize, streams);
+  }
+
+  /**
+   * @return the file-wide number of the row that will be read next, counting the rows of the
+   *         stripes skipped before the fragment
+   */
+  public long getRowNumber() {
+    return rowInStripe + rowBaseInStripe + firstRow;
+  }
+
+  /**
+   * @return the fraction of the fragment's rows consumed so far; 1 when the fragment
+   *         contains no rows
+   */
+  public float getProgress() {
+    if (totalRowCount == 0) {
+      // nothing to read: avoid 0/0 = NaN and report the read as complete
+      return 1.0f;
+    }
+    return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
+  }
+
+  /**
+   * Finds the stripe containing a row.
+   *
+   * @param rowNumber row number relative to the first row of this reader
+   * @return index of the stripe within the stripes of this reader
+   * @throws IllegalArgumentException if the row lies beyond the last stripe
+   */
+  private int findStripe(long rowNumber) {
+    for (int i = 0; i < stripes.size(); i++) {
+      OrcProto.StripeInformation stripe = stripes.get(i);
+      if (stripe.getNumberOfRows() > rowNumber) {
+        return i;
+      }
+      rowNumber -= stripe.getNumberOfRows();
+    }
+    throw new IllegalArgumentException("Seek after the end of reader range");
+  }
+
+  /**
+   * Reads the row index of a stripe; see
+   * {@link #readRowIndex(int, boolean[], OrcProto.RowIndex[], OrcProto.BloomFilterIndex[])}.
+   */
+  OrcIndex readRowIndex(
+      int stripeIndex, boolean[] included) throws IOException {
+    return readRowIndex(stripeIndex, included, null, null);
+  }
+
+  /**
+   * Reads the row index of a stripe. For the current stripe the cached footer is reused and
+   * the cached index arrays are filled when the caller passes {@code null} arrays.
+   *
+   * @param stripeIndex      index of the stripe within the stripes of this reader
+   * @param included         per-column flags selecting the indexes to read
+   * @param indexes          array receiving the row indexes, or {@code null}
+   * @param bloomFilterIndex array receiving the bloom filter indexes, or {@code null}
+   * @return the indexes of the stripe
+   * @throws IOException if reading fails
+   */
+  OrcIndex readRowIndex(int stripeIndex, boolean[] included, OrcProto.RowIndex[] indexes,
+                        OrcProto.BloomFilterIndex[] bloomFilterIndex) throws IOException {
+    OrcProto.StripeInformation stripe = stripes.get(stripeIndex);
+    OrcProto.StripeFooter stripeFooter = null;
+    // if this is the current stripe, use the cached objects.
+    if (stripeIndex == currentStripe) {
+      stripeFooter = this.stripeFooter;
+      indexes = indexes == null ? this.indexes : indexes;
+      bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
+    }
+    return metadata.readRowIndex(stripe, stripeFooter, included, indexes, null,
+        bloomFilterIndex);
+  }
+
+  /**
+   * Positions every column reader at the start of a row group, using the loaded row indexes.
+   *
+   * @param reader   the column readers to reposition
+   * @param rowEntry index of the row group within the current stripe
+   * @throws IOException if seeking fails
+   */
+  private void seekToRowEntry(TreeReaderFactory.TreeReader []reader, int rowEntry)
+      throws IOException {
+    PositionProvider[] index = new PositionProvider[indexes.length];
+    for (int i = 0; i < indexes.length; ++i) {
+      if (indexes[i] != null) {
+        index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
+      }
+    }
+    for (TreeReaderFactory.TreeReader eachReader : reader) {
+      eachReader.seek(index);
+    }
+  }
+
+  /**
+   * Positions the reader on a row.
+   *
+   * @param rowNumber file-wide row number, as reported by {@link #getRowNumber()}
+   * @throws IllegalArgumentException if the row number is negative or lies outside the range
+   *                                  of this reader
+   * @throws IOException              if reading the stripe or its index fails
+   */
+  public void seekToRow(long rowNumber) throws IOException {
+    if (rowNumber < 0) {
+      throw new IllegalArgumentException("Seek to a negative row number " +
+          rowNumber);
+    } else if (rowNumber < firstRow) {
+      throw new IllegalArgumentException("Seek before reader range " +
+          rowNumber);
+    }
+    // convert to our internal form (rows from the beginning of slice)
+    rowNumber -= firstRow;
+
+    // move to the right stripe
+    int rightStripe = findStripe(rowNumber);
+    if (rightStripe != currentStripe) {
+      currentStripe = rightStripe;
+      readStripe();
+    }
+    readRowIndex(currentStripe, included);
+
+    // if we aren't to the right row yet, advance in the stripe.
+    advanceToNextRow(reader, rowNumber, true);
+  }
+
+  /**
+   * @return the number of bytes read so far by the data reader
+   */
+  public long getNumBytes() {
+    return ((RecordReaderUtils.DefaultDataReader)dataReader).getReadBytes();
+  }
+
+  /**
+   * Releases the streams of the current stripe and closes both the data reader and the
+   * metadata reader. Every resource is closed even if closing an earlier one fails.
+   */
+  @Override
+  public void close() throws IOException {
+    try {
+      clearStreams();
+    } finally {
+      try {
+        dataReader.close();
+      } finally {
+        // the metadata reader holds its own input stream, opened in the constructor
+        metadata.close();
+      }
+    }
+  }
+
+  /** Hands out the positions of one row index entry one after another. */
+  public static final class PositionProviderImpl implements PositionProvider {
+    private final OrcProto.RowIndexEntry entry;
+    private int index;
+
+    public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
+      this(entry, 0);
+    }
+
+    public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
+      this.entry = entry;
+      this.index = startPos;
+    }
+
+    @Override
+    public long getNext() {
+      return entry.getPositions(index++);
+    }
+  }
+}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
index 3a474dd188..5c7fa458ee 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java
@@ -21,6 +21,10 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.*;
+import org.apache.orc.*;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.impl.*;
+import org.apache.orc.impl.SnappyCodec;
 
 import java.util.Arrays;
 import java.util.HashMap;
@@ -198,4 +202,43 @@ public static int getFlattenedColumnsCount(ObjectInspector inspector) {
     return numWriters;
   }
 
+  /**
+   * Creates the codec implementation for a compression kind.
+   *
+   * @param kind the compression kind to create a codec for
+   * @return the codec, or {@code null} for {@code NONE}
+   * @throws IllegalArgumentException if the kind is unknown, or the LZO codec class cannot be
+   *                                  found, instantiated or accessed
+   */
+  public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.CompressionKind kind) {
+    switch (kind) {
+      case NONE:
+        return null;
+      case ZLIB:
+        return new org.apache.orc.impl.ZlibCodec();
+      case SNAPPY:
+        return new SnappyCodec();
+      case LZO:
+        try {
+          ClassLoader loader = Thread.currentThread().getContextClassLoader();
+          if (loader == null) {
+            throw new RuntimeException("error while getting a class loader");
+          }
+          @SuppressWarnings("unchecked")
+          Class<? extends org.apache.orc.CompressionCodec> lzo =
+              (Class<? extends org.apache.orc.CompressionCodec>)
+                  loader.loadClass("org.apache.hadoop.hive.ql.io.orc.LzoCodec");
+          return lzo.newInstance();
+        } catch (ClassNotFoundException e) {
+          throw new IllegalArgumentException("LZO is not available.", e);
+        } catch (InstantiationException e) {
+          throw new IllegalArgumentException("Problem initializing LZO", e);
+        } catch (IllegalAccessException e) {
+          throw new IllegalArgumentException("Insufficient access to LZO", e);
+        }
+      default:
+        throw new IllegalArgumentException("Unknown compression codec: " +
+            kind);
+    }
+  }
 }
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java
new file mode 100644
index 0000000000..5253711664
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java
@@ -0,0 +1,479 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ComparisonChain; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.io.DiskRange; +import org.apache.hadoop.hive.common.io.DiskRangeList; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.orc.CompressionCodec; +import org.apache.orc.DataReader; +import org.apache.orc.OrcProto; +import org.apache.orc.impl.BufferChunk; +import org.apache.orc.impl.DirectDecompressionCodec; +import org.apache.orc.impl.OutStream; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +public class RecordReaderUtils { + + public static class DefaultDataReader implements DataReader { + private FSDataInputStream file; + private ByteBufferAllocatorPool pool; + private HadoopShims.ZeroCopyReaderShim zcr; + private FileSystem fs; + private Path path; + private boolean useZeroCopy; + private CompressionCodec codec; + private long readBytes = 0; + + public DefaultDataReader( + FileSystem fs, Path path, boolean useZeroCopy, CompressionCodec codec) { + this.fs = fs; + this.path = path; + this.useZeroCopy = useZeroCopy; + this.codec = codec; + } + + @Override + public void open() throws IOException { + this.file = fs.open(path); + if (useZeroCopy) { + pool = new ByteBufferAllocatorPool(); + zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool); + } else { + pool = null; + zcr = null; + } + } + + @Override + public DiskRangeList readFileData( + DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException { + return readDiskRanges(file, zcr, baseOffset, range, doForceDirect); + } + + @Override + public void close() throws IOException { + if 
(file != null) { + file.close(); + } + if (pool != null) { + pool.clear(); + } + } + + @Override + public boolean isTrackingDiskRanges() { + return zcr != null; + } + + @Override + public void releaseBuffer(ByteBuffer buffer) { + zcr.releaseBuffer(buffer); + } + + public long getReadBytes() { + return readBytes; + } + + /** + * Read the list of ranges from the file. + * @param file the file to read + * @param base the base of the stripe + * @param range the disk ranges within the stripe to read + * @return the bytes read for each disk range, which is the same length as + * ranges + * @throws IOException + */ + private DiskRangeList readDiskRanges(FSDataInputStream file, + HadoopShims.ZeroCopyReaderShim zcr, + long base, + DiskRangeList range, + boolean doForceDirect) throws IOException { + if (range == null) return null; + DiskRangeList prev = range.prev; + if (prev == null) { + prev = new DiskRangeList.MutateHelper(range); + } + while (range != null) { + if (range.hasData()) { + range = range.next; + continue; + } + int len = (int) (range.getEnd() - range.getOffset()); + long off = range.getOffset(); + if (zcr != null) { + file.seek(base + off); + boolean hasReplaced = false; + while (len > 0) { + ByteBuffer partial = zcr.readBuffer(len, false); + readBytes += partial.remaining(); + BufferChunk bc = new BufferChunk(partial, off); + if (!hasReplaced) { + range.replaceSelfWith(bc); + hasReplaced = true; + } else { + range.insertAfter(bc); + } + range = bc; + int read = partial.remaining(); + len -= read; + off += read; + } + } else { + // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless. 
+ byte[] buffer = new byte[len]; + file.readFully((base + off), buffer, 0, buffer.length); + readBytes += buffer.length; + ByteBuffer bb = null; + if (doForceDirect) { + bb = ByteBuffer.allocateDirect(len); + bb.put(buffer); + bb.position(0); + bb.limit(len); + } else { + bb = ByteBuffer.wrap(buffer); + } + range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset())); + } + range = range.next; + } + return prev.next; + } + } + + public static DataReader createDefaultDataReader( + FileSystem fs, Path path, boolean useZeroCopy, CompressionCodec codec) { + return new DefaultDataReader(fs, path, useZeroCopy, codec); + } + + public static boolean[] findPresentStreamsByColumn( + List streamList, List types) { + boolean[] hasNull = new boolean[types.size()]; + for(OrcProto.Stream stream: streamList) { + if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) { + hasNull[stream.getColumn()] = true; + } + } + return hasNull; + } + + /** + * Does region A overlap region B? The end points are inclusive on both sides. + * @param leftA A's left point + * @param rightA A's right point + * @param leftB B's left point + * @param rightB B's right point + * @return Does region A overlap region B? 
+ */ + static boolean overlap(long leftA, long rightA, long leftB, long rightB) { + if (leftA <= leftB) { + return rightA >= leftB; + } + return rightB >= leftA; + } + + public static void addEntireStreamToRanges( + long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) { + list.addOrMerge(offset, offset + length, doMergeBuffers, false); + } + + public static void addRgFilteredStreamToRanges(OrcProto.Stream stream, + boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index, + OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull, + long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) { + for (int group = 0; group < includedRowGroups.length; ++group) { + if (!includedRowGroups[group]) continue; + int posn = getIndexPosition( + encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull); + long start = index.getEntry(group).getPositions(posn); + final long nextGroupOffset; + boolean isLast = group == (includedRowGroups.length - 1); + nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn); + + start += offset; + long end = offset + estimateRgEndOffset( + isCompressed, isLast, nextGroupOffset, length, compressionSize); + list.addOrMerge(start, end, doMergeBuffers, true); + } + } + + public static long estimateRgEndOffset(boolean isCompressed, boolean isLast, + long nextGroupOffset, long streamLength, int bufferSize) { + // figure out the worst case last location + // if adjacent groups have the same compressed block offset then stretch the slop + // by factor of 2 to safely accommodate the next compression block. + // One for the current compression block and another for the next compression block. + long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP; + return isLast ? 
streamLength : Math.min(streamLength, nextGroupOffset + slop); + } + + private static final int BYTE_STREAM_POSITIONS = 1; + private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1; + private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1; + private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1; + + /** + * Get the offset in the index positions for the column that the given + * stream starts. + * @param columnEncoding the encoding of the column + * @param columnType the type of the column + * @param streamType the kind of the stream + * @param isCompressed is the file compressed + * @param hasNulls does the column have a PRESENT stream? + * @return the number of positions that will be used for that stream + */ + public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding, + OrcProto.Type.Kind columnType, + OrcProto.Stream.Kind streamType, + boolean isCompressed, + boolean hasNulls) { + if (streamType == OrcProto.Stream.Kind.PRESENT) { + return 0; + } + int compressionValue = isCompressed ? 1 : 0; + int base = hasNulls ? 
(BITFIELD_POSITIONS + compressionValue) : 0; + switch (columnType) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case DATE: + case STRUCT: + case MAP: + case LIST: + case UNION: + return base; + case CHAR: + case VARCHAR: + case STRING: + if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY || + columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) { + return base; + } else { + if (streamType == OrcProto.Stream.Kind.DATA) { + return base; + } else { + return base + BYTE_STREAM_POSITIONS + compressionValue; + } + } + case BINARY: + if (streamType == OrcProto.Stream.Kind.DATA) { + return base; + } + return base + BYTE_STREAM_POSITIONS + compressionValue; + case DECIMAL: + if (streamType == OrcProto.Stream.Kind.DATA) { + return base; + } + return base + BYTE_STREAM_POSITIONS + compressionValue; + case TIMESTAMP: + if (streamType == OrcProto.Stream.Kind.DATA) { + return base; + } + return base + RUN_LENGTH_INT_POSITIONS + compressionValue; + default: + throw new IllegalArgumentException("Unknown type " + columnType); + } + } + + // for uncompressed streams, what is the most overlap with the following set + // of rows (long vint literal group). + static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512; + + /** + * Is this stream part of a dictionary? + * @return is this part of a dictionary? + */ + public static boolean isDictionary(OrcProto.Stream.Kind kind, + OrcProto.ColumnEncoding encoding) { + assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT; + OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind(); + return kind == OrcProto.Stream.Kind.DICTIONARY_DATA || + (kind == OrcProto.Stream.Kind.LENGTH && + (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY || + encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2)); + } + + /** + * Build a string representation of a list of disk ranges. 
+ * @param range ranges to stringify + * @return the resulting string + */ + public static String stringifyDiskRanges(DiskRangeList range) { + StringBuilder buffer = new StringBuilder(); + buffer.append("["); + boolean isFirst = true; + while (range != null) { + if (!isFirst) { + buffer.append(", {"); + } else { + buffer.append("{"); + } + isFirst = false; + buffer.append(range.toString()); + buffer.append("}"); + range = range.next; + } + buffer.append("]"); + return buffer.toString(); + } + + public static List getStreamBuffers(DiskRangeList range, long offset, long length) { + // This assumes sorted ranges (as do many other parts of ORC code. + ArrayList buffers = new ArrayList(); + if (length == 0) return buffers; + long streamEnd = offset + length; + boolean inRange = false; + while (range != null) { + if (!inRange) { + if (range.getEnd() <= offset) { + range = range.next; + continue; // Skip until we are in range. + } + inRange = true; + if (range.getOffset() < offset) { + // Partial first buffer, add a slice of it. + buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset)); + if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer. + range = range.next; + continue; + } + } else if (range.getOffset() >= streamEnd) { + break; + } + if (range.getEnd() > streamEnd) { + // Partial last buffer (may also be the first buffer), add a slice of it. + buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset)); + break; + } + // Buffer that belongs entirely to one stream. + // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot + // because bufferChunks is also used by clearStreams for zcr. Create a useless dup. 
+ buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset)); + if (range.getEnd() == streamEnd) break; + range = range.next; + } + return buffers; + } + + /** + * Create a zero-copy reader shim for the file. + * + * @return the shim when there is no codec, or the codec is a DirectDecompressionCodec + * that reports itself available; null otherwise + */ + static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file, + CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException { + if ((codec == null || ((codec instanceof DirectDecompressionCodec) + && ((DirectDecompressionCodec) codec).isAvailable()))) { + /* codec is null or is available */ + return ShimLoader.getHadoopShims().getZeroCopyReader(file, pool); + } + return null; + } + + // this is an implementation copied from ElasticByteBufferPool in hadoop-2, + // which lacks a clear()/clean() operation + public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim { + /** + * Key of a pooled buffer: ordered by capacity first, then by insertion generation. + */ + private static final class Key implements Comparable { + private final int capacity; + private final long insertionGeneration; + + Key(int capacity, long insertionGeneration) { + this.capacity = capacity; + this.insertionGeneration = insertionGeneration; + } + + @Override + public int compareTo(Key other) { + return ComparisonChain.start().compare(capacity, other.capacity) + .compare(insertionGeneration, other.insertionGeneration).result(); + } + + /** + * Two keys are equal when they compare as equal; anything that is not a Key + * (including null) is never equal. + */ + @Override + public boolean equals(Object rhs) { + // instanceof is false for null, so neither a null check nor a ClassCastException handler is needed + if (!(rhs instanceof Key)) { + return false; + } + return compareTo((Key) rhs) == 0; + } + + @Override + public int hashCode() { + return new HashCodeBuilder().append(capacity).append(insertionGeneration) + .toHashCode(); + } + } + + private final TreeMap buffers = new TreeMap(); + + private final TreeMap directBuffers = new TreeMap(); + + private long currentGeneration = 0; + + private final TreeMap getBufferTree(boolean direct) { + return direct ? 
directBuffers : buffers; + } + + public void clear() { + buffers.clear(); + directBuffers.clear(); + } + + @Override + public ByteBuffer getBuffer(boolean direct, int length) { + TreeMap tree = getBufferTree(direct); + Map.Entry entry = tree.ceilingEntry(new Key(length, 0)); + if (entry == null) { + return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer + .allocate(length); + } + tree.remove(entry.getKey()); + return entry.getValue(); + } + + @Override + public void putBuffer(ByteBuffer buffer) { + TreeMap tree = getBufferTree(buffer.isDirect()); + while (true) { + Key key = new Key(buffer.capacity(), currentGeneration++); + if (!tree.containsKey(key)) { + tree.put(key, buffer); + return; + } + // Buffers are indexed by (capacity, generation). + // If our key is not unique on the first try, we try again + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java new file mode 100644 index 0000000000..c1781ef6a6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -0,0 +1,1576 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.io.Text; +import org.apache.orc.OrcProto; +import org.apache.orc.impl.*; +import org.apache.orc.impl.DynamicByteArray; +import org.apache.orc.impl.SerializationUtils; +import org.apache.orc.impl.StreamName; +import org.apache.orc.impl.WriterImpl; +import org.apache.tajo.TajoConstants; +import org.apache.tajo.catalog.Column; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.TypeDesc; +import org.apache.tajo.datum.Datum; +import org.apache.tajo.datum.DatumFactory; +import org.apache.tajo.datum.NullDatum; +import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.unit.TimeUnit; +import org.apache.tajo.util.datetime.DateTimeUtil; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; + +public class TreeReaderFactory { + + private final static Log LOG = LogFactory.getLog(TreeReaderFactory.class); + + public static class TreeReaderSchema { + + /** + * The types in the ORC file. + */ + List fileTypes; + + /** + * The treeReaderSchema that the reader should read as. + */ + List schemaTypes; + + /** + * The subtype of the row STRUCT. Different than 0 for ACID. 
+ */ + int innerStructSubtype; + + public TreeReaderSchema() { + fileTypes = null; + schemaTypes = null; + innerStructSubtype = -1; + } + + public TreeReaderSchema fileTypes(List fileTypes) { + this.fileTypes = fileTypes; + return this; + } + + public TreeReaderSchema schemaTypes(List schemaTypes) { + this.schemaTypes = schemaTypes; + return this; + } + + public TreeReaderSchema innerStructSubtype(int innerStructSubtype) { + this.innerStructSubtype = innerStructSubtype; + return this; + } + + public List getFileTypes() { + return fileTypes; + } + + public List getSchemaTypes() { + return schemaTypes; + } + + public int getInnerStructSubtype() { + return innerStructSubtype; + } + } + + public abstract static class TreeReader { + protected final int columnId; + protected BitFieldReader present = null; + protected boolean valuePresent = false; + + TreeReader(int columnId) throws IOException { + this(columnId, null); + } + + protected TreeReader(int columnId, InStream in) throws IOException { + this.columnId = columnId; + if (in == null) { + present = null; + valuePresent = true; + } else { + present = new BitFieldReader(in, 1); + } + } + + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind, + InStream in, + boolean signed, boolean skipCorrupt) throws IOException { + switch (kind) { + case DIRECT_V2: + case DICTIONARY_V2: + return new RunLengthIntegerReaderV2(in, signed, skipCorrupt); + case DIRECT: + case DICTIONARY: + return new RunLengthIntegerReader(in, signed); + default: + throw new IllegalArgumentException("Unknown encoding " + kind); + } + } + + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + checkEncoding(stripeFooter.getColumnsList().get(columnId)); + 
InStream in = streams.get(new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.PRESENT)); + if (in == null) { + present = null; + valuePresent = true; + } else { + present = new BitFieldReader(in, 1); + } + } + + /** + * Seek to the given position. + * + * @param index the indexes loaded from the file; the entry at this reader's columnId is used + * @throws IOException + */ + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + /** + * Seek the PRESENT bit-field reader, if this column has one, to the given position. + * + * @param index the position provider for this column + * @throws IOException + */ + public void seek(PositionProvider index) throws IOException { + if (present != null) { + present.seek(index); + } + } + + /** + * Consume the next {@code rows} entries of the PRESENT stream and count how many are set. + * Without a PRESENT stream nothing is read and {@code rows} is returned unchanged. + * + * @param rows the number of rows to inspect + * @return the number of those rows that hold a value + * @throws IOException + */ + protected long countNonNulls(long rows) throws IOException { + if (present != null) { + long result = 0; + for (long c = 0; c < rows; ++c) { + if (present.next() == 1) { + result += 1; + } + } + return result; + } else { + return rows; + } + } + + abstract void skipRows(long rows) throws IOException; + + public BitFieldReader getPresent() { + return present; + } + } + + /** + * Base class of the tree readers that hand out their values as {@link Datum}. + */ + public abstract static class DatumTreeReader extends TreeReader { + + DatumTreeReader(int columnId) throws IOException { + super(columnId); + } + + protected DatumTreeReader(int columnId, InStream in) throws IOException { + super(columnId, in); + } + + /** + * Advance the PRESENT stream, if any, and record the outcome in valuePresent. + * This base implementation always returns the null datum; subclasses call it first + * and then read the actual value when valuePresent is true. + */ + Datum next() throws IOException { + if (present != null) { + valuePresent = present.next() == 1; + } + return NullDatum.get(); + } + } + + /** + * Base class of the tree readers that hand out their values as raw bytes. + */ + public abstract static class RawStringTreeReader extends TreeReader { + RawStringTreeReader(int columnId) throws IOException { + super(columnId); + } + + protected RawStringTreeReader(int columnId, InStream in) throws IOException { + super(columnId, in); + } + + /** + * Advance the PRESENT stream, if any, and record the outcome in valuePresent. + * This base implementation always returns null. + */ + byte[] next() throws IOException { + if (present != null) { + valuePresent = present.next() == 1; + } + return null; + } + } + + public static class BooleanTreeReader extends DatumTreeReader { + protected BitFieldReader reader = null; + + BooleanTreeReader(int columnId) throws IOException { + this(columnId, null, null); + } + + protected BooleanTreeReader(int columnId, InStream present, InStream data) 
throws IOException { + super(columnId, present); + if (data != null) { + reader = new BitFieldReader(data, 1); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + reader = new BitFieldReader(streams.get(new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA)), 1); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? DatumFactory.createBool(reader.next() == 1) : NullDatum.get(); + } + } + + public static class ByteTreeReader extends DatumTreeReader { + protected RunLengthByteReader reader = null; + + ByteTreeReader(int columnId) throws IOException { + this(columnId, null, null); + } + + protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException { + super(columnId, present); + this.reader = new RunLengthByteReader(data); + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + reader = new RunLengthByteReader(streams.get(new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA))); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createBit(reader.next()) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + public static class ShortTreeReader extends DatumTreeReader { + protected IntegerReader reader = null; + + ShortTreeReader(int columnId) throws IOException { + this(columnId, null, null, null); + } + + protected ShortTreeReader(int columnId, InStream present, InStream data, + OrcProto.ColumnEncoding encoding) + throws IOException { + super(columnId, present); + if (data != null && encoding != null) { + checkEncoding(encoding); + this.reader = createIntegerReader(encoding.getKind(), data, true, false); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), true, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createInt2((short) reader.next()) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + /** + * Reads an integer-encoded column and returns every value as an INET4 datum + * (see DatumFactory.createInet4). Only DIRECT and DIRECT_V2 encodings are accepted. + */ + public static class InetTreeReader extends DatumTreeReader { + protected IntegerReader reader = null; + + InetTreeReader(int columnId) throws IOException { + this(columnId, null, null, null); + } + + protected InetTreeReader(int columnId, InStream present, InStream data, + OrcProto.ColumnEncoding encoding) + throws IOException { + super(columnId, present); + if (data != null && encoding != null) { + checkEncoding(encoding); + this.reader = createIntegerReader(encoding.getKind(), data, true, false); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), true, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + /** + * Seek both the PRESENT stream and the data reader to the given position. + * + * @param index the position provider for this column + * @throws IOException + */ + @Override + public void seek(PositionProvider index) throws IOException { + // The inherited seek only moves the PRESENT stream; the data reader has to follow it. + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createInet4((int) reader.next()) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + public static class IntTreeReader extends DatumTreeReader { + protected IntegerReader reader = null; + + IntTreeReader(int columnId) throws IOException { + this(columnId, null, null, null); + } + + protected IntTreeReader(int columnId, InStream present, InStream data, + OrcProto.ColumnEncoding encoding) + throws IOException { + super(columnId, present); + if (data != null && encoding != null) { + checkEncoding(encoding); + this.reader = createIntegerReader(encoding.getKind(), data, true, false); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), true, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createInt4((int) reader.next()) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + public static class LongTreeReader extends DatumTreeReader { + protected IntegerReader reader = null; + + LongTreeReader(int columnId, boolean skipCorrupt) throws IOException { + this(columnId, null, null, null, skipCorrupt); + } + + protected LongTreeReader(int columnId, InStream present, InStream data, + OrcProto.ColumnEncoding encoding, + boolean skipCorrupt) + throws IOException { + super(columnId, present); + if (data != null && encoding != null) { + checkEncoding(encoding); + this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), true, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createInt8(reader.next()) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + public static class FloatTreeReader extends DatumTreeReader { + protected InStream stream; + private final org.apache.orc.impl.SerializationUtils utils; + + FloatTreeReader(int columnId) throws IOException { + this(columnId, null, null); + } + + protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException { + super(columnId, present); + this.utils = new org.apache.orc.impl.SerializationUtils(); + this.stream = data; + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + stream.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createFloat4(utils.readFloat(stream)) : NullDatum.get(); + } + + /** + * Skip the given number of rows by reading and discarding one float per non-null row. + * + * @param items the number of rows (null or not) to skip + * @throws IOException + */ + @Override + protected void skipRows(long items) throws IOException { + items = countNonNulls(items); + // long counter: an int counter would overflow before reaching a bound above Integer.MAX_VALUE + for (long i = 0; i < items; ++i) { + utils.readFloat(stream); + } + } + } + + public static class DoubleTreeReader extends DatumTreeReader { + protected InStream stream; + private final org.apache.orc.impl.SerializationUtils utils; + + DoubleTreeReader(int columnId) throws IOException { + this(columnId, null, null); + } + + protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException { + super(columnId, present); + this.utils = new SerializationUtils(); + this.stream = data; + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = + new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + stream.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? 
DatumFactory.createFloat8(utils.readDouble(stream)) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + long len = items * 8; + while (len > 0) { + len -= stream.skip(len); + } + } + } + + public static class BinaryTreeReader extends DatumTreeReader { + protected InStream stream; + protected IntegerReader lengths = null; + protected final LongColumnVector scratchlcv; + + BinaryTreeReader(int columnId) throws IOException { + this(columnId, null, null, null, null); + } + + protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length, + OrcProto.ColumnEncoding encoding) throws IOException { + super(columnId, present); + scratchlcv = new LongColumnVector(); + this.stream = data; + if (length != null && encoding != null) { + checkEncoding(encoding); + this.lengths = createIntegerReader(encoding.getKind(), length, false, false); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + stream.seek(index); + lengths.seek(index); 
+ } + + @Override + Datum next() throws IOException { + super.next(); + + if (valuePresent) { + int len = (int) lengths.next(); + byte[] buf = new byte[len]; + int offset = 0; + while (len > 0) { + int written = stream.read(buf, offset, len); + if (written < 0) { + throw new EOFException("Can't finish byte read from " + stream); + } + len -= written; + offset += written; + } + return DatumFactory.createBlob(buf); + } else { + return NullDatum.get(); + } + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + long lengthToSkip = 0; + for (int i = 0; i < items; ++i) { + lengthToSkip += lengths.next(); + } + while (lengthToSkip > 0) { + lengthToSkip -= stream.skip(lengthToSkip); + } + } + } + + public static class TimestampTreeReader extends DatumTreeReader { + protected IntegerReader data = null; + protected IntegerReader nanos = null; + private final boolean skipCorrupt; + private Map baseTimestampMap; + private long base_timestamp; + private final TimeZone readerTimeZone; + private TimeZone writerTimeZone; + private boolean hasSameTZRules; + + TimestampTreeReader(TableMeta meta, int columnId, boolean skipCorrupt) throws IOException { + this(meta, columnId, null, null, null, null, skipCorrupt); + } + + protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStream, InStream dataStream, + InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt) + throws IOException { + super(columnId, presentStream); + this.skipCorrupt = skipCorrupt; + this.baseTimestampMap = new HashMap<>(); + this.readerTimeZone = TimeZone.getDefault(); + this.writerTimeZone = readerTimeZone; + this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone); + this.base_timestamp = getBaseTimestamp(TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, + TajoConstants.DEFAULT_SYSTEM_TIMEZONE)).getID()); + if (encoding != null) { + checkEncoding(encoding); + + if (dataStream != null) { + this.data = 
createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt); + } + + if (nanosStream != null) { + this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt); + } + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA)), true, skipCorrupt); + nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt); + base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone()); + } + + /** + * Return the base timestamp, in seconds, for the given writer time zone and make that + * zone the current writer time zone. Computed values are cached per time zone id. + * + * @param timeZoneId id of the writer time zone; null or empty means the reader time zone + * @return the base timestamp in seconds, parsed in the writer time zone + * @throws IOException if the base timestamp string cannot be parsed + */ + private long getBaseTimestamp(String timeZoneId) throws IOException { + // to make sure new readers read old files in the same way + if (timeZoneId == null || timeZoneId.isEmpty()) { + timeZoneId = readerTimeZone.getID(); + } + + // Track the writer zone on every call, cache hits included: next() adjusts values with + // writerTimeZone/hasSameTZRules, and they must belong to the base timestamp returned here. + writerTimeZone = TimeZone.getTimeZone(timeZoneId); + hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone); + + if (!baseTimestampMap.containsKey(timeZoneId)) { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + sdf.setTimeZone(writerTimeZone); + try { + long epoch = + sdf.parse(org.apache.orc.impl.WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND; + baseTimestampMap.put(timeZoneId, epoch); + return epoch; + } catch (ParseException e) { + throw new IOException("Unable to create base timestamp", e); + } finally { + sdf.setTimeZone(readerTimeZone); + } + } + 
+ return baseTimestampMap.get(timeZoneId); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + data.seek(index); + nanos.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + + if (valuePresent) { + long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); + long offset = 0; + // If reader and writer time zones have different rules, adjust the timezone difference + // between reader and writer taking day light savings into account. + if (!hasSameTZRules) { + offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis); + } + long adjustedMillis = millis + offset; + + // Sometimes the reader timezone might have changed after adding the adjustedMillis. + // To account for that change, check for any difference in reader timezone after + // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time). 
+ if (!hasSameTZRules && + (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) { + long newOffset = + writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis); + adjustedMillis = millis + newOffset; + } + return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); + } else { + return NullDatum.get(); + } + } + + /** + * Decode a serialized nanosecond value. The low three bits hold a scale code and the + * remaining bits the stored value; a non-zero code z means the stored value is + * multiplied by 10^(z + 1). + * + * @param serialized the value read from the nanos stream + * @return the decoded nanoseconds + */ + private static int parseNanos(long serialized) { + int zeros = 7 & (int) serialized; + int result = (int) (serialized >>> 3); + if (zeros != 0) { + for (int i = 0; i <= zeros; ++i) { + result *= 10; + } + } + return result; + } + + /** + * Combine the seconds and serialized nanos of a timestamp into milliseconds. + * + * @param seconds seconds relative to the base timestamp + * @param serializedNanos nanoseconds as stored in the file, decoded with parseNanos + * @param baseTimestampInSeconds the base timestamp the seconds are relative to + * @return the timestamp in milliseconds, with the nanos truncated to milliseconds + */ + // borrowed from Facebook's TimestampStreamReader + private static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) { + long millis = (seconds + baseTimestampInSeconds) * TimeUnit.MILLIS_PER_SECOND; + long nanos = parseNanos(serializedNanos); + + // the correction below is needed because Java integer division truncates toward zero, + // so negative values are rounded up: + // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) + // to get the correct value we need + // (-42 - 1)*1000 + 999 = -42001 + // (42)*1000 + 1 = 42001 + if (millis < 0 && nanos != 0) { + millis -= 1000; + } + // Truncate nanos to millis and add to millis + return millis + (nanos / 1_000_000); + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + data.skip(items); + nanos.skip(items); + } + } + + public static class DateTreeReader extends DatumTreeReader { + protected IntegerReader reader = null; + + DateTreeReader(int columnId) throws IOException { + this(columnId, null, null, null); + } + + protected DateTreeReader(int columnId, InStream present, InStream data, + OrcProto.ColumnEncoding encoding) throws IOException { + super(columnId, present); + if (data != null && encoding != null) { + checkEncoding(encoding); + reader = createIntegerReader(encoding.getKind(), data, true, false); + } + } + + @Override + void 
checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) && + (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), true, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + Datum next() throws IOException { + super.next(); + return valuePresent ? + DatumFactory.createDate((int) reader.next() + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH) : NullDatum.get(); + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + } + + /** + * A tree reader that will read string columns. At the start of the + * stripe, it creates an internal reader based on whether a direct or + * dictionary encoding was used. 
+ */ + public static class StringTreeReader extends DatumTreeReader { + protected RawStringTreeReader reader; + + StringTreeReader(int columnId) throws IOException { + super(columnId); + } + + protected StringTreeReader(int columnId, InStream present, InStream data, InStream length, + InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException { + super(columnId, present); + if (encoding != null) { + switch (encoding.getKind()) { + case DIRECT: + case DIRECT_V2: + reader = new StringDirectTreeReader(columnId, present, data, length, + encoding.getKind()); + break; + case DICTIONARY: + case DICTIONARY_V2: + reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary, + encoding); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + encoding.getKind()); + } + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + reader.checkEncoding(encoding); + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + // For each stripe, checks the encoding and initializes the appropriate + // reader + switch (stripeFooter.getColumnsList().get(columnId).getKind()) { + case DIRECT: + case DIRECT_V2: + reader = new StringDirectTreeReader(columnId); + break; + case DICTIONARY: + case DICTIONARY_V2: + reader = new StringDictionaryTreeReader(columnId); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + stripeFooter.getColumnsList().get(columnId).getKind()); + } + reader.startStripe(streams, stripeFooter); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + reader.seek(index); + } + + @Override + public void seek(PositionProvider index) throws IOException { + reader.seek(index); + } + + @Override + Datum next() throws IOException { + byte[] bytes = reader.next(); + return bytes == null ? 
NullDatum.get() : DatumFactory.createText(bytes); + } + + @Override + void skipRows(long items) throws IOException { + reader.skipRows(items); + } + } + + private final static class BasicTextReaderShim { + private final InputStream in; + + public BasicTextReaderShim(InputStream in) { + this.in = in; + } + + public byte[] read(int len) throws IOException { + int offset = 0; + byte[] bytes = new byte[len]; + while (len > 0) { + int written = in.read(bytes, offset, len); + if (written < 0) { + throw new EOFException("Can't finish read from " + in + " read " + + (offset) + " bytes out of " + bytes.length); + } + len -= written; + offset += written; + } + return bytes; + } + } + + /** + * A reader for string columns that are direct encoded in the current + * stripe. + */ + public static class StringDirectTreeReader extends RawStringTreeReader { + protected InStream stream; + protected BasicTextReaderShim data; + protected IntegerReader lengths; + private final LongColumnVector scratchlcv; + + StringDirectTreeReader(int columnId) throws IOException { + this(columnId, null, null, null, null); + } + + protected StringDirectTreeReader(int columnId, InStream present, InStream data, + InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException { + super(columnId, present); + this.scratchlcv = new LongColumnVector(); + this.stream = data; + if (length != null && encoding != null) { + this.lengths = createIntegerReader(encoding, length, false, false); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT && + encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + org.apache.orc.impl.StreamName name = new 
org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DATA); + stream = streams.get(name); + data = new BasicTextReaderShim(stream); + + lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), + false, false); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + stream.seek(index); + // don't seek data stream + lengths.seek(index); + } + + @Override + byte[] next() throws IOException { + super.next(); + int len = (int) lengths.next(); + return valuePresent ? data.read(len) : null; + } + + @Override + void skipRows(long items) throws IOException { + items = countNonNulls(items); + long lengthToSkip = 0; + for (int i = 0; i < items; ++i) { + lengthToSkip += lengths.next(); + } + + while (lengthToSkip > 0) { + lengthToSkip -= stream.skip(lengthToSkip); + } + } + + public IntegerReader getLengths() { + return lengths; + } + + public InStream getStream() { + return stream; + } + } + + /** + * A reader for string columns that are dictionary encoded in the current + * stripe. 
+ */ + public static class StringDictionaryTreeReader extends RawStringTreeReader { + private org.apache.orc.impl.DynamicByteArray dictionaryBuffer; + private int[] dictionaryOffsets; + protected IntegerReader reader; + + private byte[] dictionaryBufferInBytesCache = null; + private final LongColumnVector scratchlcv; + private final Text result = new Text(); + + StringDictionaryTreeReader(int columnId) throws IOException { + this(columnId, null, null, null, null, null); + } + + protected StringDictionaryTreeReader(int columnId, InStream present, InStream data, + InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) + throws IOException { + super(columnId, present); + scratchlcv = new LongColumnVector(); + if (data != null && encoding != null) { + this.reader = createIntegerReader(encoding.getKind(), data, false, false); + } + + if (dictionary != null && encoding != null) { + readDictionaryStream(dictionary); + } + + if (length != null && encoding != null) { + readDictionaryLengthStream(length, encoding); + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY && + encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) { + throw new IOException("Unknown encoding " + encoding + " in column " + + columnId); + } + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + super.startStripe(streams, stripeFooter); + + // read the dictionary blob + org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId, + OrcProto.Stream.Kind.DICTIONARY_DATA); + InStream in = streams.get(name); + readDictionaryStream(in); + + // read the lengths + name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH); + in = streams.get(name); + readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId)); + + // set up the row reader + name = new 
org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA); + reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), + streams.get(name), false, false); + } + + private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding) + throws IOException { + int dictionarySize = encoding.getDictionarySize(); + if (in != null) { // Guard against empty LENGTH stream. + IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false); + int offset = 0; + if (dictionaryOffsets == null || + dictionaryOffsets.length < dictionarySize + 1) { + dictionaryOffsets = new int[dictionarySize + 1]; + } + for (int i = 0; i < dictionarySize; ++i) { + dictionaryOffsets[i] = offset; + offset += (int) lenReader.next(); + } + dictionaryOffsets[dictionarySize] = offset; + in.close(); + } + + } + + private void readDictionaryStream(InStream in) throws IOException { + if (in != null) { // Guard against empty dictionary stream. + if (in.available() > 0) { + dictionaryBuffer = new DynamicByteArray(64, in.available()); + dictionaryBuffer.readAll(in); + // Since its start of strip invalidate the cache. 
+ dictionaryBufferInBytesCache = null; + } + in.close(); + } else { + dictionaryBuffer = null; + } + } + + @Override + void seek(PositionProvider[] index) throws IOException { + seek(index[columnId]); + } + + @Override + public void seek(PositionProvider index) throws IOException { + super.seek(index); + reader.seek(index); + } + + @Override + byte[] next() throws IOException { + super.next(); + if (valuePresent) { + int entry = (int) reader.next(); + int offset = dictionaryOffsets[entry]; + int length = getDictionaryEntryLength(entry, offset); + // If the column is just empty strings, the size will be zero, + // so the buffer will be null, in that case just return result + // as it will default to empty + if (dictionaryBuffer != null) { + dictionaryBuffer.setText(result, offset, length); + } else { + result.clear(); + } + return result.getBytes(); + } else { + return null; + } + } + + int getDictionaryEntryLength(int entry, int offset) { + final int length; + // if it isn't the last entry, subtract the offsets otherwise use + // the buffer length. + if (entry < dictionaryOffsets.length - 1) { + length = dictionaryOffsets[entry + 1] - offset; + } else { + length = dictionaryBuffer.size() - offset; + } + return length; + } + + @Override + void skipRows(long items) throws IOException { + reader.skip(countNonNulls(items)); + } + + public IntegerReader getReader() { + return reader; + } + } + + /** + * A tree reader that will read string columns. At the start of the + * stripe, it creates an internal reader based on whether a direct or + * dictionary encoding was used. 
+ */ + public static class CharTreeReader extends DatumTreeReader { + protected RawStringTreeReader reader; + private final int maxLength; + + CharTreeReader(int columnId, int maxLength) throws IOException { + this(columnId, null, null, null, null, null, maxLength); + } + + protected CharTreeReader(int columnId, InStream present, InStream data, InStream length, + InStream dictionary, OrcProto.ColumnEncoding encoding, int maxLength) throws IOException { + super(columnId, present); + this.maxLength = maxLength; + if (encoding != null) { + switch (encoding.getKind()) { + case DIRECT: + case DIRECT_V2: + reader = new StringDirectTreeReader(columnId, present, data, length, + encoding.getKind()); + break; + case DICTIONARY: + case DICTIONARY_V2: + reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary, + encoding); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + encoding.getKind()); + } + } + } + + @Override + void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException { + reader.checkEncoding(encoding); + } + + @Override + void startStripe(Map streams, + OrcProto.StripeFooter stripeFooter + ) throws IOException { + // For each stripe, checks the encoding and initializes the appropriate + // reader + switch (stripeFooter.getColumnsList().get(columnId).getKind()) { + case DIRECT: + case DIRECT_V2: + reader = new StringDirectTreeReader(columnId); + break; + case DICTIONARY: + case DICTIONARY_V2: + reader = new StringDictionaryTreeReader(columnId); + break; + default: + throw new IllegalArgumentException("Unsupported encoding " + + stripeFooter.getColumnsList().get(columnId).getKind()); + } + reader.startStripe(streams, stripeFooter); + } + + @Override + void seek(PositionProvider[] index) throws IOException { + reader.seek(index); + } + + @Override + public void seek(PositionProvider index) throws IOException { + reader.seek(index); + } + + @Override + Datum next() throws IOException { + byte[] 
bytes = reader.next(); + + if (bytes == null) { + return NullDatum.get(); + } + // TODO: enforce char length + return DatumFactory.createChar(bytes); + } + + @Override + void skipRows(long items) throws IOException { + reader.skipRows(items); + } + } + +// protected static class StructTreeReader extends TreeReader { +// private final int fileColumnCount; +// private final int resultColumnCount; +// protected final TreeReader[] fields; +// private final String[] fieldNames; +// +// protected StructTreeReader( +// int columnId, +// TreeReaderSchema treeReaderSchema, +// boolean[] included, +// boolean skipCorrupt) throws IOException { +// super(columnId); +// +// OrcProto.Type fileStructType = treeReaderSchema.getFileTypes().get(columnId); +// fileColumnCount = fileStructType.getFieldNamesCount(); +// +// OrcProto.Type schemaStructType = treeReaderSchema.getSchemaTypes().get(columnId); +// +// if (columnId == treeReaderSchema.getInnerStructSubtype()) { +// // If there are more result columns than reader columns, we will default those additional +// // columns to NULL. +// resultColumnCount = schemaStructType.getFieldNamesCount(); +// } else { +// resultColumnCount = fileColumnCount; +// } +// +// this.fields = new TreeReader[fileColumnCount]; +// this.fieldNames = new String[fileColumnCount]; +// +// if (included == null) { +// for (int i = 0; i < fileColumnCount; ++i) { +// int subtype = schemaStructType.getSubtypes(i); +// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt); +// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name. 
+// this.fieldNames[i] = schemaStructType.getFieldNames(i); +// } +// } else { +// for (int i = 0; i < fileColumnCount; ++i) { +// int subtype = schemaStructType.getSubtypes(i); +// if (subtype >= included.length) { +// throw new IOException("subtype " + subtype + " exceeds the included array size " + +// included.length + " fileTypes " + treeReaderSchema.getFileTypes().toString() + +// " schemaTypes " + treeReaderSchema.getSchemaTypes().toString() + +// " innerStructSubtype " + treeReaderSchema.getInnerStructSubtype()); +// } +// if (included[subtype]) { +// this.fields[i] = createTreeReader(subtype, treeReaderSchema, included, skipCorrupt); +// } +// // Use the treeReaderSchema evolution name since file/reader types may not have the real column name. +// this.fieldNames[i] = schemaStructType.getFieldNames(i); +// } +// } +// } +// +// @Override +// void seek(PositionProvider[] index) throws IOException { +// super.seek(index); +// for (TreeReader kid : fields) { +// if (kid != null) { +// kid.seek(index); +// } +// } +// } +// +// @Override +// Object next(Object previous) throws IOException { +// super.next(previous); +// OrcStruct result = null; +// if (valuePresent) { +// if (previous == null) { +// result = new OrcStruct(resultColumnCount); +// } else { +// result = (OrcStruct) previous; +// +// // If the input format was initialized with a file with a +// // different number of fields, the number of fields needs to +// // be updated to the correct number +// if (result.getNumFields() != resultColumnCount) { +// result.setNumFields(resultColumnCount); +// } +// } +// for (int i = 0; i < fileColumnCount; ++i) { +// if (fields[i] != null) { +// result.setFieldValue(i, fields[i].next(result.getFieldValue(i))); +// } +// } +// if (resultColumnCount > fileColumnCount) { +// for (int i = fileColumnCount; i < resultColumnCount; ++i) { +// // Default new treeReaderSchema evolution fields to NULL. 
+// result.setFieldValue(i, null); +// } +// } +// } +// return result; +// } +// +// @Override +// void startStripe(Map streams, +// OrcProto.StripeFooter stripeFooter +// ) throws IOException { +// super.startStripe(streams, stripeFooter); +// for (TreeReader field : fields) { +// if (field != null) { +// field.startStripe(streams, stripeFooter); +// } +// } +// } +// +// @Override +// void skipRows(long items) throws IOException { +// items = countNonNulls(items); +// for (TreeReader field : fields) { +// if (field != null) { +// field.skipRows(items); +// } +// } +// } +// } + + public static DatumTreeReader createTreeReader(TableMeta meta, + int columnId, + Column column, + boolean skipCorrupt + ) throws IOException { + TypeDesc typeDesc = column.getTypeDesc(); + int orcColumnId = columnId + 1; // root record column is considered + switch (typeDesc.getDataType().getType()) { + case BOOLEAN: + return new BooleanTreeReader(orcColumnId); + case BIT: + return new ByteTreeReader(orcColumnId); + case FLOAT8: + return new DoubleTreeReader(orcColumnId); + case FLOAT4: + return new FloatTreeReader(orcColumnId); + case INT2: + return new ShortTreeReader(orcColumnId); + case INT4: + return new IntTreeReader(orcColumnId); + case INT8: + return new LongTreeReader(orcColumnId, skipCorrupt); + case TEXT: + return new StringTreeReader(orcColumnId); + case CHAR: + return new CharTreeReader(orcColumnId, typeDesc.getDataType().getLength()); + case BLOB: + return new BinaryTreeReader(orcColumnId); + case TIMESTAMP: + return new TimestampTreeReader(meta, orcColumnId, skipCorrupt); + case DATE: + return new DateTreeReader(orcColumnId); + case INET4: + return new InetTreeReader(orcColumnId); +// case STRUCT: +// return new StructTreeReader(columnId, treeReaderSchema, included, skipCorrupt); + default: + throw new TajoRuntimeException(new UnsupportedException("Unsupported type " + + typeDesc.getDataType().getType().name())); + } + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 833d102744..4cf008a3a9 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.Text; +import org.apache.tajo.unit.TimeUnit; import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; @@ -1467,7 +1468,6 @@ void recordPosition(PositionRecorder recorder) throws IOException { } } - static final int MILLIS_PER_SECOND = 1000; static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00"; private static class TimestampTreeWriter extends TreeWriter { @@ -1489,7 +1489,7 @@ private static class TimestampTreeWriter extends TreeWriter { OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer); recordPosition(rowIndexPosition); // for unit tests to set different time zones - this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND; + this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / TimeUnit.MILLIS_PER_SECOND; writer.useWriterTimeZone(true); timeZone = writer.getTimeZone(); } @@ -1515,7 +1515,7 @@ void write(Datum datum) throws IOException { Timestamp val = new Timestamp(javaTimestamp); indexStatistics.updateTimestamp(val); - seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp); + seconds.write((val.getTime() / TimeUnit.MILLIS_PER_SECOND) - base_timestamp); nanos.write(formatNanos(val.getNanos())); if (createBloomFilter) { bloomFilter.addLong(val.getTime()); diff --git 
a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java index b63b497d5b..cc3f46399b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java @@ -38,6 +38,7 @@ import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.sequencefile.SequenceFileScanner; import org.apache.tajo.storage.text.DelimitedTextFile; +import org.apache.tajo.storage.thirdparty.orc.OrcFile.OrcTableProperties; import org.apache.tajo.util.CommonTestingUtil; import org.junit.Test; import org.junit.runner.RunWith; @@ -61,6 +62,7 @@ public class TestCompressionStorages { public TestCompressionStorages(String type) throws IOException { this.dataFormat = type; conf = new TajoConf(); + conf.setBoolean("hive.exec.orc.zerocopy", true); testDir = CommonTestingUtil.getTestDir(TEST_PATH); fs = testDir.getFileSystem(conf); @@ -71,7 +73,8 @@ public static Collection generateParameters() { return Arrays.asList(new Object[][]{ {BuiltinStorages.TEXT}, {BuiltinStorages.RCFILE}, - {BuiltinStorages.SEQUENCE_FILE} + {BuiltinStorages.SEQUENCE_FILE}, + {BuiltinStorages.ORC} }); } @@ -120,6 +123,14 @@ private void storageCompressionTest(String dataFormat, Class tajo.storage.scanner-handler.orc.class - org.apache.tajo.storage.orc.ORCScanner + org.apache.tajo.storage.orc.OrcScanner From 9b555db7b38c6b85fd74926a23484b3aaa02c82b Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sat, 19 Mar 2016 23:05:02 +0900 Subject: [PATCH 02/16] Clean up dependency. 
--- .../org/apache/tajo/catalog/TypeDesc.java | 4 + .../apache/tajo/storage/StorageConstants.java | 6 +- .../java/org/apache/tajo/unit/TimeUnit.java | 2 - tajo-storage/tajo-storage-hdfs/pom.xml | 38 +- .../apache/tajo/storage/orc/ORCAppender.java | 82 +- .../apache/tajo/storage/orc/OrcScanner.java | 65 +- .../orc/BinaryColumnStatistics.java | 25 - .../thirdparty/orc/BitFieldWriter.java | 69 -- .../storage/thirdparty/orc/BloomFilterIO.java | 42 - .../orc/BooleanColumnStatistics.java | 27 - .../thirdparty/orc/ColumnStatistics.java | 36 - .../thirdparty/orc/ColumnStatisticsImpl.java | 1017 ----------------- .../thirdparty/orc/CompressionCodec.java | 68 -- .../thirdparty/orc/CompressionKind.java | 27 - .../thirdparty/orc/DateColumnStatistics.java | 37 - .../orc/DecimalColumnStatistics.java | 45 - .../orc/DirectDecompressionCodec.java | 26 - .../orc/DoubleColumnStatistics.java | 44 - .../thirdparty/orc/DynamicByteArray.java | 303 ----- .../thirdparty/orc/DynamicIntArray.java | 142 --- .../thirdparty/orc/HdfsOrcDataSource.java | 133 --- .../orc/IntegerColumnStatistics.java | 50 - .../storage/thirdparty/orc/IntegerWriter.java | 47 - .../storage/thirdparty/orc/MemoryManager.java | 212 ---- .../tajo/storage/thirdparty/orc/Metadata.java | 45 - .../thirdparty/orc/MetadataReader.java | 128 --- .../tajo/storage/thirdparty/orc/OrcFile.java | 387 ++++--- .../thirdparty/orc/OrcRecordReader.java | 29 +- .../tajo/storage/thirdparty/orc/OrcUtils.java | 63 +- .../storage/thirdparty/orc/OutStream.java | 286 ----- .../thirdparty/orc/PositionRecorder.java | 25 - .../orc/PositionedOutputStream.java | 38 - .../storage/thirdparty/orc/RedBlackTree.java | 309 ----- .../thirdparty/orc/RunLengthByteWriter.java | 106 -- .../orc/RunLengthIntegerWriter.java | 143 --- .../orc/RunLengthIntegerWriterV2.java | 832 -------------- .../thirdparty/orc/SerializationUtils.java | 844 -------------- .../storage/thirdparty/orc/SnappyCodec.java | 109 -- .../storage/thirdparty/orc/StreamName.java | 95 -- 
.../orc/StringColumnStatistics.java | 41 - .../thirdparty/orc/StringRedBlackTree.java | 202 ---- .../thirdparty/orc/StripeInformation.java | 59 - .../thirdparty/orc/StripeStatistics.java | 42 - .../orc/TimestampColumnStatistics.java | 38 - .../thirdparty/orc/TreeReaderFactory.java | 3 +- .../tajo/storage/thirdparty/orc/Writer.java | 2 + .../storage/thirdparty/orc/WriterImpl.java | 814 ++++++------- .../storage/thirdparty/orc/ZlibCodec.java | 169 --- .../src/main/proto/orc_proto.proto | 4 +- .../tajo/storage/TestCompressionStorages.java | 8 +- .../resources/dataset/testVariousTypes.avsc | 3 +- 51 files changed, 843 insertions(+), 6528 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerWriter.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java delete 
mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java index 3bd0f006a6..3ca83f987b 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TypeDesc.java @@ -55,6 +55,10 @@ public boolean equals(Object obj) { } } + public Schema getNestedSchema() { + return nestedRecordSchema; + } + public int hashCode() { return Objects.hashCode(dataType.hashCode(), nestedRecordSchema); } diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java 
b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index 097963cb25..4612323deb 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -89,11 +89,7 @@ public class StorageConstants { public static final String DEFAULT_ORC_STRIPE_SIZE = "67108864"; // 64MB public static final String ORC_COMPRESSION = "orc.compress"; - public static final String ORC_COMPRESSION_KIND_NONE = "none"; - public static final String ORC_COMPRESSION_KIND_SNAPPY = "snappy"; - public static final String ORC_COMPRESSION_KIND_LZO = "lzo"; - public static final String ORC_COMPRESSION_KIND_ZIP = "zlip"; - public static final String DEFAULT_ORC_COMPRESSION_KIND = ORC_COMPRESSION_KIND_NONE; + public static final String DEFAULT_ORC_COMPRESSION_KIND = "none"; public static final String ORC_BUFFER_SIZE = "orc.buffer.size"; public static final String DEFAULT_ORC_BUFFER_SIZE = "262144"; // 256KB diff --git a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java index a03a930d78..8062f2de5a 100644 --- a/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java +++ b/tajo-common/src/main/java/org/apache/tajo/unit/TimeUnit.java @@ -26,6 +26,4 @@ public class TimeUnit { public static final int DAY = HOUR * 24; public static final int PART_UNIT = 5*TimeUnit.MIN; - - public static final int MILLIS_PER_SECOND = 1000; } diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 2c4538a6ba..2c5da75ef7 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -129,7 +129,7 @@ --proto_path=../../tajo-catalog/tajo-catalog-common/src/main/proto --java_out=target/generated-sources/proto src/main/proto/StorageFragmentProtos.proto - src/main/proto/orc_proto.proto + @@ -344,11 +344,6 @@ io.netty netty-buffer - - com.facebook.presto - 
presto-orc - 0.141 - org.apache.hive hive-orc @@ -367,36 +362,25 @@ log4j-1.2-api org.apache.logging.log4j - - - - org.apache.hive - hive-exec - ${hive.version} - - log4j-1.2-api - org.apache.logging.log4j + hive-common + org.apache.hive - log4j-slf4j-impl - org.apache.logging.log4j + libthrift + org.apache.thrift - antlr-runtime - org.antlr + opencsv + net.sf.opencsv - jline - jline - - - calcite-core - org.apache.calcite + hadoop-yarn-server-resourcemanager + org.apache.hadoop - calcite-avatica - org.apache.calcite + hive-shims-scheduler + org.apache.hive.shims diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index 7999d02487..ec4349628c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -20,6 +20,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; +import org.apache.orc.TypeDescription; import org.apache.tajo.TajoConstants; import org.apache.tajo.TaskAttemptId; import org.apache.tajo.catalog.Schema; @@ -29,12 +33,13 @@ import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.TableStatistics; import org.apache.tajo.storage.Tuple; -import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; -import org.apache.tajo.storage.thirdparty.orc.CompressionKind; import org.apache.tajo.storage.thirdparty.orc.OrcFile; +import org.apache.tajo.storage.thirdparty.orc.OrcFile.EncodingStrategy; +import org.apache.tajo.storage.thirdparty.orc.OrcUtils; import org.apache.tajo.storage.thirdparty.orc.Writer; import java.io.IOException; +import java.util.Properties; import 
java.util.TimeZone; public class ORCAppender extends FileAppender { @@ -52,15 +57,7 @@ public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem @Override public void init() throws IOException { - writer = OrcFile.createWriter(workDir.getFileSystem(conf), path, conf, - ObjectInspectorFactory.buildStructObjectInspector(schema), - Long.parseLong(meta.getProperty(StorageConstants.ORC_STRIPE_SIZE, - StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), getCompressionKind(), - Integer.parseInt(meta.getProperty(StorageConstants.ORC_BUFFER_SIZE, - StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), - Integer.parseInt(meta.getProperty(StorageConstants.ORC_ROW_INDEX_STRIDE, - StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE)), - timezone); + writer = OrcFile.createWriter(path, buildWriterOptions(conf, meta, schema), timezone); if (tableStatsEnabled) { this.stats = new TableStatistics(schema, columnStatsEnabled); @@ -110,21 +107,76 @@ public long getEstimatedOutputSize() throws IOException { return writer.getRawDataSize() * writer.getNumberOfRows(); } - private CompressionKind getCompressionKind() { + private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) { + return OrcFile.writerOptions(conf) + .setSchema(OrcUtils.convertSchema(schema)) + .compress(getCompressionKind(meta)) + .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(), String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue())))) + .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(), String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue())))) + .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(), String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue())))) + .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(), String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue())))) + 
.blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(), String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue())))) + .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(), String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue())))) + .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue())))) + .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue()))); + } + + private static CompressionKind getCompressionKind(TableMeta meta) { String kindstr = meta.getProperty(StorageConstants.ORC_COMPRESSION, StorageConstants.DEFAULT_ORC_COMPRESSION_KIND); - if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_ZIP)) { + if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) { return CompressionKind.ZLIB; } - if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_SNAPPY)) { + if (kindstr.equalsIgnoreCase(CompressionKind.SNAPPY.name())) { return CompressionKind.SNAPPY; } - if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_LZO)) { + if (kindstr.equalsIgnoreCase(CompressionKind.LZO.name())) { return CompressionKind.LZO; } return CompressionKind.NONE; } + + /** + * Options for creating ORC file writers. + */ + public static class WriterOptions extends OrcFile.WriterOptions { + private boolean explicitSchema = false; + private ObjectInspector inspector = null; + // Setting the default batch size to 1000 makes the memory check at 5000 + // rows work the same as the row by row writer. (If it was the default 1024, + // the smallest stripe size would be 5120 rows, which changes the output + // of some of the tests.) 
+ private int batchSize = 1000; + + public WriterOptions(Properties tableProperties, Configuration conf) { + super(tableProperties, conf); + } + + /** + * Set the schema for the file. This is a required parameter. + * @param schema the schema for the file. + * @return this + */ + public WriterOptions setSchema(TypeDescription schema) { + this.explicitSchema = true; + super.setSchema(schema); + return this; + } + + protected WriterOptions batchSize(int maxSize) { + batchSize = maxSize; + return this; + } + + ObjectInspector getInspector() { + return inspector; + } + + int getBatchSize() { + return batchSize; + } + } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 2dc979b168..8082819bff 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -28,9 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.io.Text; -import org.apache.orc.CompressionCodec; -import org.apache.orc.FileMetaInfo; -import org.apache.orc.OrcProto; +import org.apache.orc.*; import org.apache.orc.impl.BufferChunk; import org.apache.orc.impl.InStream; import org.apache.tajo.catalog.Schema; @@ -39,12 +37,12 @@ import org.apache.tajo.storage.FileScanner; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.storage.thirdparty.orc.OrcFile; import org.apache.tajo.storage.thirdparty.orc.OrcRecordReader; import org.apache.tajo.storage.thirdparty.orc.OrcUtils; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; public class OrcScanner extends FileScanner { @@ -63,7 +61,7 @@ public class OrcScanner extends FileScanner { protected List 
types; private List userMetadata; private List fileStats; - private List stripes; + private List stripes; protected int rowIndexStride; private long contentLength, numberOfRows; @@ -264,7 +262,7 @@ public void init() throws IOException { this.userMetadata = rInfo.footer.getMetadataList(); this.fileStats = rInfo.footer.getStatisticsList(); this.versionList = footerMetaData.versionList; - this.stripes = rInfo.footer.getStripesList(); + this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); recordReader = getRecordReader(); } @@ -394,4 +392,59 @@ private static class MetaInfoObjExtractor{ } } + public static class StripeInformationImpl + implements org.apache.orc.StripeInformation { + private final OrcProto.StripeInformation stripe; + + public StripeInformationImpl(OrcProto.StripeInformation stripe) { + this.stripe = stripe; + } + + @Override + public long getOffset() { + return stripe.getOffset(); + } + + @Override + public long getLength() { + return stripe.getDataLength() + getIndexLength() + getFooterLength(); + } + + @Override + public long getDataLength() { + return stripe.getDataLength(); + } + + @Override + public long getFooterLength() { + return stripe.getFooterLength(); + } + + @Override + public long getIndexLength() { + return stripe.getIndexLength(); + } + + @Override + public long getNumberOfRows() { + return stripe.getNumberOfRows(); + } + + @Override + public String toString() { + return "offset: " + getOffset() + " data: " + getDataLength() + + " rows: " + getNumberOfRows() + " tail: " + getFooterLength() + + " index: " + getIndexLength(); + } + } + + private static List convertProtoStripesToStripes( + List stripes) { + List result = new ArrayList<>(stripes.size()); + for (OrcProto.StripeInformation info : stripes) { + result.add(new StripeInformationImpl(info)); + } + return result; + } + } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java deleted file mode 100644 index bee29fb994..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics for binary columns. - */ -public interface BinaryColumnStatistics extends ColumnStatistics { - long getSum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java deleted file mode 100644 index 23719bd11e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; - -class BitFieldWriter { - private RunLengthByteWriter output; - private final int bitSize; - private byte current = 0; - private int bitsLeft = 8; - - BitFieldWriter(PositionedOutputStream output, - int bitSize) throws IOException { - this.output = new RunLengthByteWriter(output); - this.bitSize = bitSize; - } - - private void writeByte() throws IOException { - output.write(current); - current = 0; - bitsLeft = 8; - } - - void flush() throws IOException { - if (bitsLeft != 8) { - writeByte(); - } - output.flush(); - } - - void write(int value) throws IOException { - int bitsToWrite = bitSize; - while (bitsToWrite > bitsLeft) { - // add the bits to the bottom of the current word - current |= value >>> (bitsToWrite - bitsLeft); - // subtract out the bits we just added - bitsToWrite -= bitsLeft; - // zero out the bits above bitsToWrite - value &= (1 << bitsToWrite) - 1; - writeByte(); - } - bitsLeft -= bitsToWrite; - current |= value << bitsLeft; - if (bitsLeft == 0) { - writeByte(); - } - } - - void getPosition(PositionRecorder recorder) throws IOException { - output.getPosition(recorder); - recorder.addPosition(8 - bitsLeft); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java deleted file mode 100644 index 9d7c09cfb6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.primitives.Longs; -import org.apache.tajo.storage.thirdparty.orc.util.BloomFilter; - -public class BloomFilterIO extends BloomFilter { - - public BloomFilterIO(long expectedEntries) { - super(expectedEntries, DEFAULT_FPP); - } - - public BloomFilterIO(long expectedEntries, double fpp) { - super(expectedEntries, fpp); - } - -/** - * Initializes the BloomFilter from the given Orc BloomFilter - */ - public BloomFilterIO(OrcProto.BloomFilter bloomFilter) { - this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList())); - this.numHashFunctions = bloomFilter.getNumHashFunctions(); - this.numBits = (int) this.bitSet.bitSize(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java deleted file mode 100644 index 0f55697339..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics for boolean columns. - */ -public interface BooleanColumnStatistics extends ColumnStatistics { - long getFalseCount(); - - long getTrueCount(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java deleted file mode 100644 index b317e41a42..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics that are available for all types of columns. - */ -public interface ColumnStatistics { - /** - * Get the number of values in this column. It will differ from the number - * of rows because of NULL values and repeated values. - * @return the number of values - */ - long getNumberOfValues(); - - /** - * Returns true if there are nulls in the scope of column statistics. 
- * @return true if null present else false - */ - boolean hasNull(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java deleted file mode 100644 index d74f9893b3..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java +++ /dev/null @@ -1,1017 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.tajo.datum.Datum; - -import java.sql.Date; -import java.sql.Timestamp; - -class ColumnStatisticsImpl implements ColumnStatistics { - - private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl - implements BooleanColumnStatistics { - private long trueCount = 0; - - BooleanStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.BucketStatistics bkt = stats.getBucketStatistics(); - trueCount = bkt.getCount(0); - } - - BooleanStatisticsImpl() { - } - - @Override - void reset() { - super.reset(); - trueCount = 0; - } - - @Override - void updateBoolean(boolean value) { - if (value) { - trueCount += 1; - } - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof BooleanStatisticsImpl) { - BooleanStatisticsImpl bkt = (BooleanStatisticsImpl) other; - trueCount += bkt.trueCount; - } else { - if (isStatsExists() && trueCount != 0) { - throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder builder = super.serialize(); - OrcProto.BucketStatistics.Builder bucket = - OrcProto.BucketStatistics.newBuilder(); - bucket.addCount(trueCount); - builder.setBucketStatistics(bucket); - return builder; - } - - @Override - public long getFalseCount() { - return getNumberOfValues() - trueCount; - } - - @Override - public long getTrueCount() { - return trueCount; - } - - @Override - public String toString() { - return super.toString() + " true: " + trueCount; - } - } - - private static final class IntegerStatisticsImpl extends 
ColumnStatisticsImpl - implements IntegerColumnStatistics { - - private long minimum = Long.MAX_VALUE; - private long maximum = Long.MIN_VALUE; - private long sum = 0; - private boolean hasMinimum = false; - private boolean overflow = false; - - IntegerStatisticsImpl() { - } - - IntegerStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.IntegerStatistics intStat = stats.getIntStatistics(); - if (intStat.hasMinimum()) { - hasMinimum = true; - minimum = intStat.getMinimum(); - } - if (intStat.hasMaximum()) { - maximum = intStat.getMaximum(); - } - if (intStat.hasSum()) { - sum = intStat.getSum(); - } else { - overflow = true; - } - } - - @Override - void reset() { - super.reset(); - hasMinimum = false; - minimum = Long.MAX_VALUE; - maximum = Long.MIN_VALUE; - sum = 0; - overflow = false; - } - - @Override - void updateInteger(long value) { - if (!hasMinimum) { - hasMinimum = true; - minimum = value; - maximum = value; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - if (!overflow) { - boolean wasPositive = sum >= 0; - sum += value; - if ((value >= 0) == wasPositive) { - overflow = (sum >= 0) != wasPositive; - } - } - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof IntegerStatisticsImpl) { - IntegerStatisticsImpl otherInt = (IntegerStatisticsImpl) other; - if (!hasMinimum) { - hasMinimum = otherInt.hasMinimum; - minimum = otherInt.minimum; - maximum = otherInt.maximum; - } else if (otherInt.hasMinimum) { - if (otherInt.minimum < minimum) { - minimum = otherInt.minimum; - } - if (otherInt.maximum > maximum) { - maximum = otherInt.maximum; - } - } - - overflow |= otherInt.overflow; - if (!overflow) { - boolean wasPositive = sum >= 0; - sum += otherInt.sum; - if ((otherInt.sum >= 0) == wasPositive) { - overflow = (sum >= 0) != wasPositive; - } - } - } else { - if (isStatsExists() && hasMinimum) { - throw new IllegalArgumentException("Incompatible merging 
of integer column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder builder = super.serialize(); - OrcProto.IntegerStatistics.Builder intb = - OrcProto.IntegerStatistics.newBuilder(); - if (hasMinimum) { - intb.setMinimum(minimum); - intb.setMaximum(maximum); - } - if (!overflow) { - intb.setSum(sum); - } - builder.setIntStatistics(intb); - return builder; - } - - @Override - public long getMinimum() { - return minimum; - } - - @Override - public long getMaximum() { - return maximum; - } - - @Override - public boolean isSumDefined() { - return !overflow; - } - - @Override - public long getSum() { - return sum; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (hasMinimum) { - buf.append(" min: "); - buf.append(minimum); - buf.append(" max: "); - buf.append(maximum); - } - if (!overflow) { - buf.append(" sum: "); - buf.append(sum); - } - return buf.toString(); - } - } - - private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl - implements DoubleColumnStatistics { - private boolean hasMinimum = false; - private double minimum = Double.MAX_VALUE; - private double maximum = Double.MIN_VALUE; - private double sum = 0; - - DoubleStatisticsImpl() { - } - - DoubleStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.DoubleStatistics dbl = stats.getDoubleStatistics(); - if (dbl.hasMinimum()) { - hasMinimum = true; - minimum = dbl.getMinimum(); - } - if (dbl.hasMaximum()) { - maximum = dbl.getMaximum(); - } - if (dbl.hasSum()) { - sum = dbl.getSum(); - } - } - - @Override - void reset() { - super.reset(); - hasMinimum = false; - minimum = Double.MAX_VALUE; - maximum = Double.MIN_VALUE; - sum = 0; - } - - @Override - void updateDouble(double value) { - if (!hasMinimum) { - hasMinimum = true; - minimum = value; - maximum = value; - } else if (value < minimum) { - minimum = 
value; - } else if (value > maximum) { - maximum = value; - } - sum += value; - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof DoubleStatisticsImpl) { - DoubleStatisticsImpl dbl = (DoubleStatisticsImpl) other; - if (!hasMinimum) { - hasMinimum = dbl.hasMinimum; - minimum = dbl.minimum; - maximum = dbl.maximum; - } else if (dbl.hasMinimum) { - if (dbl.minimum < minimum) { - minimum = dbl.minimum; - } - if (dbl.maximum > maximum) { - maximum = dbl.maximum; - } - } - sum += dbl.sum; - } else { - if (isStatsExists() && hasMinimum) { - throw new IllegalArgumentException("Incompatible merging of double column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder builder = super.serialize(); - OrcProto.DoubleStatistics.Builder dbl = - OrcProto.DoubleStatistics.newBuilder(); - if (hasMinimum) { - dbl.setMinimum(minimum); - dbl.setMaximum(maximum); - } - dbl.setSum(sum); - builder.setDoubleStatistics(dbl); - return builder; - } - - @Override - public double getMinimum() { - return minimum; - } - - @Override - public double getMaximum() { - return maximum; - } - - @Override - public double getSum() { - return sum; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (hasMinimum) { - buf.append(" min: "); - buf.append(minimum); - buf.append(" max: "); - buf.append(maximum); - } - buf.append(" sum: "); - buf.append(sum); - return buf.toString(); - } - } - - protected static final class StringStatisticsImpl extends ColumnStatisticsImpl - implements StringColumnStatistics { - private String minimum = null; - private String maximum = null; - private long sum = 0; - - StringStatisticsImpl() { - } - - StringStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.StringStatistics str = stats.getStringStatistics(); - if (str.hasMaximum()) { - maximum = str.getMaximum(); - } - if 
(str.hasMinimum()) { - minimum = str.getMinimum(); - } - if(str.hasSum()) { - sum = str.getSum(); - } - } - - @Override - void reset() { - super.reset(); - minimum = null; - maximum = null; - sum = 0; - } - - @Override - void updateString(String value) { - if (minimum == null) { - maximum = minimum = value; - } else if (minimum.compareTo(value) > 0) { - minimum = value; - } else if (maximum.compareTo(value) < 0) { - maximum = value; - } - sum += value.length(); - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof StringStatisticsImpl) { - StringStatisticsImpl str = (StringStatisticsImpl) other; - if (minimum == null) { - if (str.minimum != null) { - maximum = str.getMaximum(); - minimum = str.getMinimum(); - } else { - /* both are empty */ - maximum = minimum = null; - } - } else if (str.minimum != null) { - if (minimum.compareTo(str.minimum) > 0) { - minimum = str.getMinimum(); - } - if (maximum.compareTo(str.maximum) < 0) { - maximum = str.getMaximum(); - } - } - sum += str.sum; - } else { - if (isStatsExists() && minimum != null) { - throw new IllegalArgumentException("Incompatible merging of string column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder result = super.serialize(); - OrcProto.StringStatistics.Builder str = - OrcProto.StringStatistics.newBuilder(); - if (getNumberOfValues() != 0) { - str.setMinimum(getMinimum()); - str.setMaximum(getMaximum()); - str.setSum(sum); - } - result.setStringStatistics(str); - return result; - } - - @Override - public String getMinimum() { - return minimum; - } - - @Override - public String getMaximum() { - return maximum; - } - - @Override - public long getSum() { - return sum; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" min: "); - buf.append(getMinimum()); - buf.append(" max: "); - 
buf.append(getMaximum()); - buf.append(" sum: "); - buf.append(sum); - } - return buf.toString(); - } - } - - protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements - BinaryColumnStatistics { - - private long sum = 0; - - BinaryStatisticsImpl() { - } - - BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics(); - if (binStats.hasSum()) { - sum = binStats.getSum(); - } - } - - @Override - void reset() { - super.reset(); - sum = 0; - } - - @Override - void updateBinary(Datum value) { - sum += value.size(); - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof BinaryColumnStatistics) { - BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other; - sum += bin.sum; - } else { - if (isStatsExists() && sum != 0) { - throw new IllegalArgumentException("Incompatible merging of binary column statistics"); - } - } - super.merge(other); - } - - @Override - public long getSum() { - return sum; - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder result = super.serialize(); - OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder(); - bin.setSum(sum); - result.setBinaryStatistics(bin); - return result; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" sum: "); - buf.append(sum); - } - return buf.toString(); - } - } - - private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl - implements DecimalColumnStatistics { - private HiveDecimal minimum = null; - private HiveDecimal maximum = null; - private HiveDecimal sum = HiveDecimal.ZERO; - - DecimalStatisticsImpl() { - } - - DecimalStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.DecimalStatistics dec = stats.getDecimalStatistics(); - if (dec.hasMaximum()) { - maximum 
= HiveDecimal.create(dec.getMaximum()); - } - if (dec.hasMinimum()) { - minimum = HiveDecimal.create(dec.getMinimum()); - } - if (dec.hasSum()) { - sum = HiveDecimal.create(dec.getSum()); - } else { - sum = null; - } - } - - @Override - void reset() { - super.reset(); - minimum = null; - maximum = null; - sum = HiveDecimal.ZERO; - } - - @Override - void updateDecimal(HiveDecimal value) { - if (minimum == null) { - minimum = value; - maximum = value; - } else if (minimum.compareTo(value) > 0) { - minimum = value; - } else if (maximum.compareTo(value) < 0) { - maximum = value; - } - if (sum != null) { - sum = sum.add(value); - } - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof DecimalStatisticsImpl) { - DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other; - if (minimum == null) { - minimum = dec.minimum; - maximum = dec.maximum; - sum = dec.sum; - } else if (dec.minimum != null) { - if (minimum.compareTo(dec.minimum) > 0) { - minimum = dec.minimum; - } - if (maximum.compareTo(dec.maximum) < 0) { - maximum = dec.maximum; - } - if (sum == null || dec.sum == null) { - sum = null; - } else { - sum = sum.add(dec.sum); - } - } - } else { - if (isStatsExists() && minimum != null) { - throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder result = super.serialize(); - OrcProto.DecimalStatistics.Builder dec = - OrcProto.DecimalStatistics.newBuilder(); - if (getNumberOfValues() != 0 && minimum != null) { - dec.setMinimum(minimum.toString()); - dec.setMaximum(maximum.toString()); - } - if (sum != null) { - dec.setSum(sum.toString()); - } - result.setDecimalStatistics(dec); - return result; - } - - @Override - public HiveDecimal getMinimum() { - return minimum; - } - - @Override - public HiveDecimal getMaximum() { - return maximum; - } - - @Override - public HiveDecimal 
getSum() { - return sum; - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" min: "); - buf.append(minimum); - buf.append(" max: "); - buf.append(maximum); - if (sum != null) { - buf.append(" sum: "); - buf.append(sum); - } - } - return buf.toString(); - } - } - - private static final class DateStatisticsImpl extends ColumnStatisticsImpl - implements DateColumnStatistics { - private Integer minimum = null; - private Integer maximum = null; - - DateStatisticsImpl() { - } - - DateStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.DateStatistics dateStats = stats.getDateStatistics(); - // min,max values serialized/deserialized as int (days since epoch) - if (dateStats.hasMaximum()) { - maximum = dateStats.getMaximum(); - } - if (dateStats.hasMinimum()) { - minimum = dateStats.getMinimum(); - } - } - - @Override - void reset() { - super.reset(); - minimum = null; - maximum = null; - } - - @Override - void updateDate(int daysSinceEpoch) { - if (minimum == null) { - minimum = daysSinceEpoch; - maximum = daysSinceEpoch; - } else if (minimum > daysSinceEpoch) { - minimum = daysSinceEpoch; - } else if (maximum < daysSinceEpoch) { - maximum = daysSinceEpoch; - } - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof DateStatisticsImpl) { - DateStatisticsImpl dateStats = (DateStatisticsImpl) other; - if (minimum == null) { - minimum = dateStats.minimum; - maximum = dateStats.maximum; - } else if (dateStats.minimum != null) { - if (minimum > dateStats.minimum) { - minimum = dateStats.minimum; - } - if (maximum < dateStats.maximum) { - maximum = dateStats.maximum; - } - } - } else { - if (isStatsExists() && minimum != null) { - throw new IllegalArgumentException("Incompatible merging of date column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - 
OrcProto.ColumnStatistics.Builder result = super.serialize(); - OrcProto.DateStatistics.Builder dateStats = - OrcProto.DateStatistics.newBuilder(); - if (getNumberOfValues() != 0 && minimum != null) { - dateStats.setMinimum(minimum); - dateStats.setMaximum(maximum); - } - result.setDateStatistics(dateStats); - return result; - } - - private transient final DateWritable minDate = new DateWritable(); - private transient final DateWritable maxDate = new DateWritable(); - - @Override - public Date getMinimum() { - if (minimum == null) { - return null; - } - minDate.set(minimum); - return minDate.get(); - } - - @Override - public Date getMaximum() { - if (maximum == null) { - return null; - } - maxDate.set(maximum); - return maxDate.get(); - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" min: "); - buf.append(getMinimum()); - buf.append(" max: "); - buf.append(getMaximum()); - } - return buf.toString(); - } - } - - private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl - implements TimestampColumnStatistics { - private Long minimum = null; - private Long maximum = null; - - TimestampStatisticsImpl() { - } - - TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); - OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics(); - // min,max values serialized/deserialized as int (milliseconds since epoch) - if (timestampStats.hasMaximum()) { - maximum = timestampStats.getMaximum(); - } - if (timestampStats.hasMinimum()) { - minimum = timestampStats.getMinimum(); - } - } - - @Override - void reset() { - super.reset(); - minimum = null; - maximum = null; - } - - @Override - void updateTimestamp(Timestamp value) { - if (minimum == null) { - minimum = value.getTime(); - maximum = value.getTime(); - } else if (minimum > value.getTime()) { - minimum = value.getTime(); - } else if (maximum < value.getTime()) { - 
maximum = value.getTime(); - } - } - - @Override - void merge(ColumnStatisticsImpl other) { - if (other instanceof TimestampStatisticsImpl) { - TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) other; - if (minimum == null) { - minimum = timestampStats.minimum; - maximum = timestampStats.maximum; - } else if (timestampStats.minimum != null) { - if (minimum > timestampStats.minimum) { - minimum = timestampStats.minimum; - } - if (maximum < timestampStats.maximum) { - maximum = timestampStats.maximum; - } - } - } else { - if (isStatsExists() && minimum != null) { - throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); - } - } - super.merge(other); - } - - @Override - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder result = super.serialize(); - OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics - .newBuilder(); - if (getNumberOfValues() != 0 && minimum != null) { - timestampStats.setMinimum(minimum); - timestampStats.setMaximum(maximum); - } - result.setTimestampStatistics(timestampStats); - return result; - } - - @Override - public Timestamp getMinimum() { - return minimum == null ? null : new Timestamp(minimum); - } - - @Override - public Timestamp getMaximum() { - return maximum == null ? 
null : new Timestamp(maximum); - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { - buf.append(" min: "); - buf.append(getMinimum()); - buf.append(" max: "); - buf.append(getMaximum()); - } - return buf.toString(); - } - } - - private long count = 0; - private boolean hasNull = false; - - ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { - if (stats.hasNumberOfValues()) { - count = stats.getNumberOfValues(); - } - - hasNull = !stats.hasHasNull() || stats.getHasNull(); - } - - ColumnStatisticsImpl() { - } - - void increment() { - count += 1; - } - - void setNull() { - hasNull = true; - } - - void updateBoolean(boolean value) { - throw new UnsupportedOperationException("Can't update boolean"); - } - - void updateInteger(long value) { - throw new UnsupportedOperationException("Can't update integer"); - } - - void updateDouble(double value) { - throw new UnsupportedOperationException("Can't update double"); - } - - void updateString(String value) { - throw new UnsupportedOperationException("Can't update string"); - } - - void updateBinary(Datum value) { - throw new UnsupportedOperationException("Can't update binary"); - } - - void updateDecimal(HiveDecimal value) { - throw new UnsupportedOperationException("Can't update decimal"); - } - - void updateDate(int days) { - throw new UnsupportedOperationException("Can't update date"); - } - - void updateTimestamp(Timestamp value) { - throw new UnsupportedOperationException("Can't update timestamp"); - } - - boolean isStatsExists() { - return (count > 0 || hasNull); - } - - void merge(ColumnStatisticsImpl stats) { - count += stats.count; - hasNull |= stats.hasNull; - } - - void reset() { - count = 0; - hasNull = false; - } - - @Override - public long getNumberOfValues() { - return count; - } - - @Override - public boolean hasNull() { - return hasNull; - } - - @Override - public String toString() { - return "count: " + count + " 
hasNull: " + hasNull; - } - - OrcProto.ColumnStatistics.Builder serialize() { - OrcProto.ColumnStatistics.Builder builder = - OrcProto.ColumnStatistics.newBuilder(); - builder.setNumberOfValues(count); - builder.setHasNull(hasNull); - return builder; - } - - static ColumnStatisticsImpl create(ObjectInspector inspector) { - switch (inspector.getCategory()) { - case PRIMITIVE: - switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { - case BOOLEAN: - return new BooleanStatisticsImpl(); - case BYTE: - case SHORT: - case INT: - case LONG: - return new IntegerStatisticsImpl(); - case FLOAT: - case DOUBLE: - return new DoubleStatisticsImpl(); - case STRING: - case CHAR: - case VARCHAR: - return new StringStatisticsImpl(); - case DECIMAL: - return new DecimalStatisticsImpl(); - case DATE: - return new DateStatisticsImpl(); - case TIMESTAMP: - return new TimestampStatisticsImpl(); - case BINARY: - return new BinaryStatisticsImpl(); - default: - return new ColumnStatisticsImpl(); - } - default: - return new ColumnStatisticsImpl(); - } - } - - static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) { - if (stats.hasBucketStatistics()) { - return new BooleanStatisticsImpl(stats); - } else if (stats.hasIntStatistics()) { - return new IntegerStatisticsImpl(stats); - } else if (stats.hasDoubleStatistics()) { - return new DoubleStatisticsImpl(stats); - } else if (stats.hasStringStatistics()) { - return new StringStatisticsImpl(stats); - } else if (stats.hasDecimalStatistics()) { - return new DecimalStatisticsImpl(stats); - } else if (stats.hasDateStatistics()) { - return new DateStatisticsImpl(stats); - } else if (stats.hasTimestampStatistics()) { - return new TimestampStatisticsImpl(stats); - } else if(stats.hasBinaryStatistics()) { - return new BinaryStatisticsImpl(stats); - } else { - return new ColumnStatisticsImpl(stats); - } - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java deleted file mode 100644 index 769ca50b21..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.EnumSet; - -public interface CompressionCodec { - - public enum Modifier { - /* speed/compression tradeoffs */ - FASTEST, - FAST, - DEFAULT, - /* data sensitivity modifiers */ - TEXT, - BINARY - }; - - /** - * Compress the in buffer to the out buffer. - * @param in the bytes to compress - * @param out the uncompressed bytes - * @param overflow put any additional bytes here - * @return true if the output is smaller than input - * @throws IOException - */ - boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow - ) throws IOException; - - /** - * Decompress the in buffer to the out buffer. 
- * @param in the bytes to decompress - * @param out the decompressed bytes - * @throws IOException - */ - void decompress(ByteBuffer in, ByteBuffer out) throws IOException; - - /** - * Produce a modified compression codec if the underlying algorithm allows - * modification. - * - * This does not modify the current object, but returns a new object if - * modifications are possible. Returns the same object if no modifications - * are possible. - * @param modifiers compression modifiers - * @return codec for use after optional modification - */ - CompressionCodec modify(@Nullable EnumSet modifiers); - -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java deleted file mode 100644 index 8b16c6711f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.thirdparty.orc; - -/** - * An enumeration that lists the generic compression algorithms that - * can be applied to ORC files. - */ -public enum CompressionKind { - NONE, ZLIB, SNAPPY, LZO -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java deleted file mode 100644 index cb3405e8da..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.util.Date; - -/** - * Statistics for DATE columns. - */ -public interface DateColumnStatistics extends ColumnStatistics { - /** - * Get the minimum value for the column. - * @return minimum value - */ - Date getMinimum(); - - /** - * Get the maximum value for the column. 
- * @return maximum value - */ - Date getMaximum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java deleted file mode 100644 index 27cdac2187..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.common.type.HiveDecimal; - -/** - * Statistics for decimal columns. - */ -public interface DecimalColumnStatistics extends ColumnStatistics { - - /** - * Get the minimum value for the column. - * @return the minimum value - */ - HiveDecimal getMinimum(); - - /** - * Get the maximum value for the column. - * @return the maximum value - */ - HiveDecimal getMaximum(); - - /** - * Get the sum of the values of the column. 
- * @return the sum - */ - HiveDecimal getSum(); - -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java deleted file mode 100644 index 53330523b5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; -import java.nio.ByteBuffer; - -public interface DirectDecompressionCodec extends CompressionCodec { - public boolean isAvailable(); - public void directDecompress(ByteBuffer in, ByteBuffer out) throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java deleted file mode 100644 index ddce8f7078..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics for float and double columns. - */ -public interface DoubleColumnStatistics extends ColumnStatistics { - - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - double getMinimum(); - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. 
- * @return the maximum - */ - double getMaximum(); - - /** - * Get the sum of the values in the column. - * @return the sum - */ - double getSum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java deleted file mode 100644 index 1d44f77dba..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java +++ /dev/null @@ -1,303 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.io.Text; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteBuffer; - -/** - * A class that is a growable array of bytes. Growth is managed in terms of - * chunks that are allocated when needed. 
- */ -final class DynamicByteArray { - static final int DEFAULT_CHUNKSIZE = 32 * 1024; - static final int DEFAULT_NUM_CHUNKS = 128; - - private final int chunkSize; // our allocation sizes - private byte[][] data; // the real data - private int length; // max set element index +1 - private int initializedChunks = 0; // the number of chunks created - - public DynamicByteArray() { - this(DEFAULT_NUM_CHUNKS, DEFAULT_CHUNKSIZE); - } - - public DynamicByteArray(int numChunks, int chunkSize) { - if (chunkSize == 0) { - throw new IllegalArgumentException("bad chunksize"); - } - this.chunkSize = chunkSize; - data = new byte[numChunks][]; - } - - /** - * Ensure that the given index is valid. - */ - private void grow(int chunkIndex) { - if (chunkIndex >= initializedChunks) { - if (chunkIndex >= data.length) { - int newSize = Math.max(chunkIndex + 1, 2 * data.length); - byte[][] newChunk = new byte[newSize][]; - System.arraycopy(data, 0, newChunk, 0, data.length); - data = newChunk; - } - for(int i=initializedChunks; i <= chunkIndex; ++i) { - data[i] = new byte[chunkSize]; - } - initializedChunks = chunkIndex + 1; - } - } - - public byte get(int index) { - if (index >= length) { - throw new IndexOutOfBoundsException("Index " + index + - " is outside of 0.." + - (length - 1)); - } - int i = index / chunkSize; - int j = index % chunkSize; - return data[i][j]; - } - - public void set(int index, byte value) { - int i = index / chunkSize; - int j = index % chunkSize; - grow(i); - if (index >= length) { - length = index + 1; - } - data[i][j] = value; - } - - public int add(byte value) { - int i = length / chunkSize; - int j = length % chunkSize; - grow(i); - data[i][j] = value; - int result = length; - length += 1; - return result; - } - - /** - * Copy a slice of a byte array into our buffer. 
- * @param value the array to copy from - * @param valueOffset the first location to copy from value - * @param valueLength the number of bytes to copy from value - * @return the offset of the start of the value - */ - public int add(byte[] value, int valueOffset, int valueLength) { - int i = length / chunkSize; - int j = length % chunkSize; - grow((length + valueLength) / chunkSize); - int remaining = valueLength; - while (remaining > 0) { - int size = Math.min(remaining, chunkSize - j); - System.arraycopy(value, valueOffset, data[i], j, size); - remaining -= size; - valueOffset += size; - i += 1; - j = 0; - } - int result = length; - length += valueLength; - return result; - } - - /** - * Read the entire stream into this array. - * @param in the stream to read from - * @throws IOException - */ - public void readAll(InputStream in) throws IOException { - int currentChunk = length / chunkSize; - int currentOffset = length % chunkSize; - grow(currentChunk); - int currentLength = in.read(data[currentChunk], currentOffset, - chunkSize - currentOffset); - while (currentLength > 0) { - length += currentLength; - currentOffset = length % chunkSize; - if (currentOffset == 0) { - currentChunk = length / chunkSize; - grow(currentChunk); - } - currentLength = in.read(data[currentChunk], currentOffset, - chunkSize - currentOffset); - } - } - - /** - * Byte compare a set of bytes against the bytes in this dynamic array. 
- * @param other source of the other bytes - * @param otherOffset start offset in the other array - * @param otherLength number of bytes in the other array - * @param ourOffset the offset in our array - * @param ourLength the number of bytes in our array - * @return negative for less, 0 for equal, positive for greater - */ - public int compare(byte[] other, int otherOffset, int otherLength, - int ourOffset, int ourLength) { - int currentChunk = ourOffset / chunkSize; - int currentOffset = ourOffset % chunkSize; - int maxLength = Math.min(otherLength, ourLength); - while (maxLength > 0 && - other[otherOffset] == data[currentChunk][currentOffset]) { - otherOffset += 1; - currentOffset += 1; - if (currentOffset == chunkSize) { - currentChunk += 1; - currentOffset = 0; - } - maxLength -= 1; - } - if (maxLength == 0) { - return otherLength - ourLength; - } - int otherByte = 0xff & other[otherOffset]; - int ourByte = 0xff & data[currentChunk][currentOffset]; - return otherByte > ourByte ? 1 : -1; - } - - /** - * Get the size of the array. - * @return the number of bytes in the array - */ - public int size() { - return length; - } - - /** - * Clear the array to its original pristine state. - */ - public void clear() { - length = 0; - for(int i=0; i < data.length; ++i) { - data[i] = null; - } - initializedChunks = 0; - } - - /** - * Set a text value from the bytes in this dynamic array. 
- * @param result the value to set - * @param offset the start of the bytes to copy - * @param length the number of bytes to copy - */ - public void setText(Text result, int offset, int length) { - result.clear(); - int currentChunk = offset / chunkSize; - int currentOffset = offset % chunkSize; - int currentLength = Math.min(length, chunkSize - currentOffset); - while (length > 0) { - result.append(data[currentChunk], currentOffset, currentLength); - length -= currentLength; - currentChunk += 1; - currentOffset = 0; - currentLength = Math.min(length, chunkSize - currentOffset); - } - } - - /** - * Write out a range of this dynamic array to an output stream. - * @param out the stream to write to - * @param offset the first offset to write - * @param length the number of bytes to write - * @throws IOException - */ - public void write(OutputStream out, int offset, - int length) throws IOException { - int currentChunk = offset / chunkSize; - int currentOffset = offset % chunkSize; - while (length > 0) { - int currentLength = Math.min(length, chunkSize - currentOffset); - out.write(data[currentChunk], currentOffset, currentLength); - length -= currentLength; - currentChunk += 1; - currentOffset = 0; - } - } - - @Override - public String toString() { - int i; - StringBuilder sb = new StringBuilder(length * 3); - - sb.append('{'); - int l = length - 1; - for (i=0; i 0) { - result.put(data[currentChunk], currentOffset, currentLength); - length -= currentLength; - currentChunk += 1; - currentOffset = 0; - currentLength = Math.min(length, chunkSize - currentOffset); - } - } - - /** - * Gets all the bytes of the array. 
- * - * @return Bytes of the array - */ - public byte[] get() { - byte[] result = null; - if (length > 0) { - int currentChunk = 0; - int currentOffset = 0; - int currentLength = Math.min(length, chunkSize); - int destOffset = 0; - result = new byte[length]; - int totalLength = length; - while (totalLength > 0) { - System.arraycopy(data[currentChunk], currentOffset, result, destOffset, currentLength); - destOffset += currentLength; - totalLength -= currentLength; - currentChunk += 1; - currentOffset = 0; - currentLength = Math.min(totalLength, chunkSize - currentOffset); - } - } - return result; - } - - /** - * Get the size of the buffers. - */ - public long getSizeInBytes() { - return initializedChunks * chunkSize; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java deleted file mode 100644 index a34770663d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Dynamic int array that uses primitive types and chunks to avoid copying - * large number of integers when it resizes. - * - * The motivation for this class is memory optimization, i.e. space efficient - * storage of potentially huge arrays without good a-priori size guesses. - * - * The API of this class is between a primitive array and a AbstractList. It's - * not a Collection implementation because it handles primitive types, but the - * API could be extended to support iterators and the like. - * - * NOTE: Like standard Collection implementations/arrays, this class is not - * synchronized. - */ -final class DynamicIntArray { - static final int DEFAULT_CHUNKSIZE = 8 * 1024; - static final int INIT_CHUNKS = 128; - - private final int chunkSize; // our allocation size - private int[][] data; // the real data - private int length; // max set element index +1 - private int initializedChunks = 0; // the number of created chunks - - public DynamicIntArray() { - this(DEFAULT_CHUNKSIZE); - } - - public DynamicIntArray(int chunkSize) { - this.chunkSize = chunkSize; - - data = new int[INIT_CHUNKS][]; - } - - /** - * Ensure that the given index is valid. - */ - private void grow(int chunkIndex) { - if (chunkIndex >= initializedChunks) { - if (chunkIndex >= data.length) { - int newSize = Math.max(chunkIndex + 1, 2 * data.length); - int[][] newChunk = new int[newSize][]; - System.arraycopy(data, 0, newChunk, 0, data.length); - data = newChunk; - } - for (int i=initializedChunks; i <= chunkIndex; ++i) { - data[i] = new int[chunkSize]; - } - initializedChunks = chunkIndex + 1; - } - } - - public int get(int index) { - if (index >= length) { - throw new IndexOutOfBoundsException("Index " + index + - " is outside of 0.." 
+ - (length - 1)); - } - int i = index / chunkSize; - int j = index % chunkSize; - return data[i][j]; - } - - public void set(int index, int value) { - int i = index / chunkSize; - int j = index % chunkSize; - grow(i); - if (index >= length) { - length = index + 1; - } - data[i][j] = value; - } - - public void increment(int index, int value) { - int i = index / chunkSize; - int j = index % chunkSize; - grow(i); - if (index >= length) { - length = index + 1; - } - data[i][j] += value; - } - - public void add(int value) { - int i = length / chunkSize; - int j = length % chunkSize; - grow(i); - data[i][j] = value; - length += 1; - } - - public int size() { - return length; - } - - public void clear() { - length = 0; - for(int i=0; i < data.length; ++i) { - data[i] = null; - } - initializedChunks = 0; - } - - public String toString() { - int i; - StringBuilder sb = new StringBuilder(length * 4); - - sb.append('{'); - int l = length - 1; - for (i=0; i= 0, "size is negative"); - - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); - this.maxReadSize = checkNotNull(maxReadSize, "maxMergeDistance is null"); - } - - @Override - public void close() - throws IOException - { - inputStream.close(); - } - - @Override - public long getReadTimeNanos() - { - return readTimeNanos; - } - - @Override - public long getSize() - { - return size; - } - - @Override - public void readFully(long position, byte[] buffer) - throws IOException - { - readFully(position, buffer, 0, buffer.length); - } - - @Override - public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) - throws IOException - { - long start = System.nanoTime(); - - inputStream.readFully(position, buffer, bufferOffset, bufferLength); - readTimeNanos += System.nanoTime() - start; - } - - @Override - public Map readFully(Map diskRanges) - throws IOException - { - checkNotNull(diskRanges, "diskRanges is null"); - - if (diskRanges.isEmpty()) { - return 
ImmutableMap.of(); - } - - Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance, maxReadSize); - - // read ranges - Map buffers = new LinkedHashMap<>(); - for (DiskRange mergedRange : mergedRanges) { - // read full range in one request - byte[] buffer = new byte[mergedRange.getLength()]; - readFully(mergedRange.getOffset(), buffer); - buffers.put(mergedRange, buffer); - } - - ImmutableMap.Builder slices = ImmutableMap.builder(); - diskRanges.forEach((K key, DiskRange range) -> - slices.put(key, new BasicSliceInput(getDiskRangeSlice(range, buffers)))); - - return slices.build(); - } - - @Override - public String toString() - { - return path; - } -} - - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerColumnStatistics.java deleted file mode 100644 index 208454f139..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerColumnStatistics.java +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics for all of the integer columns, such as byte, short, int, and - * long. - */ -public interface IntegerColumnStatistics extends ColumnStatistics { - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - long getMinimum(); - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the maximum - */ - long getMaximum(); - - /** - * Is the sum defined? If the sum overflowed the counter this will be false. - * @return is the sum available - */ - boolean isSumDefined(); - - /** - * Get the sum of the column. Only valid if isSumDefined returns true. - * @return the sum of the column - */ - long getSum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerWriter.java deleted file mode 100644 index 6872882792..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerWriter.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; - -/** - * Interface for writing integers. - */ -interface IntegerWriter { - - /** - * Get position from the stream. - * @param recorder - * @throws IOException - */ - void getPosition(PositionRecorder recorder) throws IOException; - - /** - * Write the integer value - * @param value - * @throws IOException - */ - void write(long value) throws IOException; - - /** - * Flush the buffer - * @throws IOException - */ - void flush() throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java deleted file mode 100644 index 79af80fbb6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java +++ /dev/null @@ -1,212 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.base.Preconditions; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; - -import java.io.IOException; -import java.lang.management.ManagementFactory; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.locks.ReentrantLock; - -/** - * Implements a memory manager that keeps a global context of how many ORC - * writers there are and manages the memory between them. For use cases with - * dynamic partitions, it is easy to end up with many writers in the same task. - * By managing the size of each allocation, we try to cut down the size of each - * allocation and keep the task from running out of memory. - * - * This class is not thread safe, but is re-entrant - ensure creation and all - * invocations are triggered from the same thread. - */ -class MemoryManager { - - private static final Log LOG = LogFactory.getLog(MemoryManager.class); - - /** - * How often should we check the memory sizes? Measured in rows added - * to all of the writers. 
- */ - private static final int ROWS_BETWEEN_CHECKS = 5000; - private final long totalMemoryPool; - private final Map writerList = - new HashMap<>(); - private long totalAllocation = 0; - private double currentScale = 1; - private int rowsAddedSinceCheck = 0; - private final OwnedLock ownerLock = new OwnedLock(); - - @SuppressWarnings("serial") - private static class OwnedLock extends ReentrantLock { - public Thread getOwner() { - return super.getOwner(); - } - } - - private static class WriterInfo { - long allocation; - Callback callback; - WriterInfo(long allocation, Callback callback) { - this.allocation = allocation; - this.callback = callback; - } - } - - public interface Callback { - /** - * The writer needs to check its memory usage - * @param newScale the current scale factor for memory allocations - * @return true if the writer was over the limit - * @throws IOException - */ - boolean checkMemory(double newScale) throws IOException; - } - - /** - * Create the memory manager. - * @param conf use the configuration to find the maximum size of the memory - * pool. - */ - MemoryManager(Configuration conf) { - HiveConf.ConfVars poolVar = HiveConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL; - double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal); - totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean(). - getHeapMemoryUsage().getMax() * maxLoad); - ownerLock.lock(); - } - - /** - * Light weight thread-safety check for multi-threaded access patterns - */ - private void checkOwner() { - Preconditions.checkArgument(ownerLock.isHeldByCurrentThread(), - "Owner thread expected %s, got %s", - ownerLock.getOwner(), - Thread.currentThread()); - } - - /** - * Add a new writer's memory allocation to the pool. We use the path - * as a unique key to ensure that we don't get duplicates. 
- * @param path the file that is being written - * @param requestedAllocation the requested buffer size - */ - void addWriter(Path path, long requestedAllocation, - Callback callback) throws IOException { - checkOwner(); - WriterInfo oldVal = writerList.get(path); - // this should always be null, but we handle the case where the memory - // manager wasn't told that a writer wasn't still in use and the task - // starts writing to the same path. - if (oldVal == null) { - oldVal = new WriterInfo(requestedAllocation, callback); - writerList.put(path, oldVal); - totalAllocation += requestedAllocation; - } else { - // handle a new writer that is writing to the same path - totalAllocation += requestedAllocation - oldVal.allocation; - oldVal.allocation = requestedAllocation; - oldVal.callback = callback; - } - updateScale(true); - } - - /** - * Remove the given writer from the pool. - * @param path the file that has been closed - */ - void removeWriter(Path path) throws IOException { - checkOwner(); - WriterInfo val = writerList.get(path); - if (val != null) { - writerList.remove(path); - totalAllocation -= val.allocation; - if (writerList.isEmpty()) { - rowsAddedSinceCheck = 0; - } - updateScale(false); - } - if(writerList.isEmpty()) { - rowsAddedSinceCheck = 0; - } - } - - /** - * Get the total pool size that is available for ORC writers. - * @return the number of bytes in the pool - */ - long getTotalMemoryPool() { - return totalMemoryPool; - } - - /** - * The scaling factor for each allocation to ensure that the pool isn't - * oversubscribed. - * @return a fraction between 0.0 and 1.0 of the requested size that is - * available for each writer. - */ - double getAllocationScale() { - return currentScale; - } - - /** - * Give the memory manager an opportunity for doing a memory check. 
- * @throws IOException - */ - void addedRow() throws IOException { - if (++rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) { - notifyWriters(); - } - } - - /** - * Notify all of the writers that they should check their memory usage. - * @throws IOException - */ - void notifyWriters() throws IOException { - checkOwner(); - LOG.debug("Notifying writers after " + rowsAddedSinceCheck); - for(WriterInfo writer: writerList.values()) { - boolean flushed = writer.callback.checkMemory(currentScale); - if (LOG.isDebugEnabled() && flushed) { - LOG.debug("flushed " + writer.toString()); - } - } - rowsAddedSinceCheck = 0; - } - - /** - * Update the currentScale based on the current allocation and pool size. - * This also updates the notificationTrigger. - * @param isAllocate is this an allocation? - */ - private void updateScale(boolean isAllocate) throws IOException { - if (totalAllocation <= totalMemoryPool) { - currentScale = 1; - } else { - currentScale = (double) totalMemoryPool / totalAllocation; - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java deleted file mode 100644 index dfa4c36d1b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.Lists; - -import java.util.List; - -public class Metadata { - - private final OrcProto.Metadata metadata; - - Metadata(OrcProto.Metadata m) { - this.metadata = m; - } - - /** - * Return list of stripe level column statistics - * - * @return list of stripe statistics - */ - public List getStripeStatistics() { - List result = Lists.newArrayList(); - for (OrcProto.StripeStatistics ss : metadata.getStripeStatsList()) { - result.add(new StripeStatistics(ss.getColStatsList())); - } - return result; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java deleted file mode 100644 index a3685a7240..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MetadataReader.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.Lists; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.io.DiskRange; -import org.apache.orc.CompressionCodec; -import org.apache.orc.OrcProto; -import org.apache.orc.StripeInformation; -import org.apache.orc.impl.BufferChunk; -import org.apache.orc.impl.InStream; -import org.apache.orc.impl.OrcIndex; - -import java.io.Closeable; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; - -public class MetadataReader implements Closeable { - - private final FSDataInputStream file; - private final CompressionCodec codec; - private final int bufferSize; - private final int typeCount; - - public MetadataReader(FileSystem fileSystem, Path path, - CompressionCodec codec, int bufferSize, int typeCount) throws IOException { - this(fileSystem.open(path), codec, bufferSize, typeCount); - } - - public MetadataReader(FSDataInputStream file, - CompressionCodec codec, int bufferSize, int typeCount) { - this.file = file; - this.codec = codec; - this.bufferSize = bufferSize; - this.typeCount = typeCount; - } - - public OrcIndex readRowIndex(OrcProto.StripeInformation stripe, - OrcProto.StripeFooter footer, boolean[] included, OrcProto.RowIndex[] indexes, - boolean[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices) throws IOException { - if (footer == null) { - footer = readStripeFooter(stripe); - } - if (indexes == null) { - 
indexes = new OrcProto.RowIndex[typeCount]; - } - if (bloomFilterIndices == null) { - bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount]; - } - long offset = stripe.getOffset(); - List streams = footer.getStreamsList(); - for (int i = 0; i < streams.size(); i++) { - OrcProto.Stream stream = streams.get(i); - OrcProto.Stream nextStream = null; - if (i < streams.size() - 1) { - nextStream = streams.get(i+1); - } - int col = stream.getColumn(); - int len = (int) stream.getLength(); - // row index stream and bloom filter are interlaced, check if the sarg column contains bloom - // filter and combine the io to read row index and bloom filters for that column together - if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) { - boolean readBloomFilter = false; - if (sargColumns != null && sargColumns[col] && - nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) { - len += nextStream.getLength(); - i += 1; - readBloomFilter = true; - } - if ((included == null || included[col]) && indexes[col] == null) { - byte[] buffer = new byte[len]; - file.readFully(offset, buffer, 0, buffer.length); - ByteBuffer bb = ByteBuffer.wrap(buffer); - indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index", - Lists.newArrayList(new BufferChunk(bb, 0)), stream.getLength(), - codec, bufferSize)); - if (readBloomFilter) { - bb.position((int) stream.getLength()); - bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create( - "bloom_filter", Lists.newArrayList(new BufferChunk(bb, 0)), - nextStream.getLength(), codec, bufferSize)); - } - } - } - offset += len; - } - - OrcIndex index = new OrcIndex(indexes, bloomFilterIndices); - return index; - } - - public OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException { - long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(); - int tailLength = (int) stripe.getFooterLength(); - - // read the footer - ByteBuffer 
tailBuf = ByteBuffer.allocate(tailLength); - file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength); - return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer", - Lists.newArrayList(new BufferChunk(tailBuf, 0)), - tailLength, codec, bufferSize)); - } - - @Override - public void close() throws IOException { - file.close(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java index b3d9d30795..8f26d212b1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java @@ -21,11 +21,15 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; - -import static org.apache.tajo.storage.thirdparty.orc.OrcConf.ConfVars.*; +import org.apache.orc.CompressionKind; +import org.apache.orc.FileMetaInfo; +import org.apache.orc.FileMetadata; +import org.apache.orc.TypeDescription; +import org.apache.orc.impl.MemoryManager; +import org.apache.tajo.storage.orc.ORCAppender; import java.io.IOException; +import java.util.Properties; import java.util.TimeZone; /** @@ -50,7 +54,7 @@ public final class OrcFile { * prevent the new reader from reading ORC files generated by any released * version of Hive. */ - public static enum Version { + public enum Version { V_0_11("0.11", 0, 11), V_0_12("0.12", 0, 12); @@ -102,9 +106,14 @@ public int getMinor() { * For bugs in the writer, but the old readers already read the new data * correctly, bump this version instead of the Version. 
*/ - public static enum WriterVersion { + public enum WriterVersion { ORIGINAL(0), - HIVE_8732(1); // corrupted stripe/file maximum column statistics + HIVE_8732(1), // corrupted stripe/file maximum column statistics + HIVE_4243(2), // use real column names from Hive tables + HIVE_12055(3), // vectorized writer + + // Don't use any magic numbers here except for the below: + FUTURE(Integer.MAX_VALUE); // a version from a future writer private final int id; @@ -112,67 +121,111 @@ public int getId() { return id; } - private WriterVersion(int id) { + WriterVersion(int id) { this.id = id; } + + private static final WriterVersion[] values; + static { + // Assumes few non-negative values close to zero. + int max = Integer.MIN_VALUE; + for (WriterVersion v : WriterVersion.values()) { + if (v.id < 0) throw new AssertionError(); + if (v.id > max && FUTURE.id != v.id) { + max = v.id; + } + } + values = new WriterVersion[max + 1]; + for (WriterVersion v : WriterVersion.values()) { + if (v.id < values.length) { + values[v.id] = v; + } + } + } + + public static WriterVersion from(int val) { + if (val == FUTURE.id) return FUTURE; // Special handling for the magic value. 
+ return values[val]; + } } + public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_12055; - public static enum EncodingStrategy { + public enum EncodingStrategy { SPEED, COMPRESSION; } - public static enum CompressionStrategy { + public enum CompressionStrategy { SPEED, COMPRESSION; } - // Note : these string definitions for table properties are deprecated, - // and retained only for backward compatibility, please do not add to - // them, add to OrcTableProperties below instead - @Deprecated public static final String COMPRESSION = "orc.compress"; - @Deprecated public static final String COMPRESSION_BLOCK_SIZE = "orc.compress.size"; - @Deprecated public static final String STRIPE_SIZE = "orc.stripe.size"; - @Deprecated public static final String ROW_INDEX_STRIDE = "orc.row.index.stride"; - @Deprecated public static final String ENABLE_INDEXES = "orc.create.index"; - @Deprecated public static final String BLOCK_PADDING = "orc.block.padding"; + // unused + private OrcFile() {} - /** - * Enum container for all orc table properties. - * If introducing a new orc-specific table property, - * add it here. - */ - public static enum OrcTableProperties { - COMPRESSION("orc.compress"), - COMPRESSION_BLOCK_SIZE("orc.compress.size"), - STRIPE_SIZE("orc.stripe.size"), - BLOCK_SIZE("orc.block.size"), - ROW_INDEX_STRIDE("orc.row.index.stride"), - ENABLE_INDEXES("orc.create.index"), - BLOCK_PADDING("orc.block.padding"), - ENCODING_STRATEGY("orc.encoding.strategy"), - BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns"), - BLOOM_FILTER_FPP("orc.bloom.filter.fpp"); + public static class ReaderOptions { + private final Configuration conf; + private FileSystem filesystem; + private FileMetaInfo fileMetaInfo; // TODO: this comes from some place. + private long maxLength = Long.MAX_VALUE; + private FileMetadata fullFileMetadata; // Propagate from LLAP cache. 
+ + public ReaderOptions(Configuration conf) { + this.conf = conf; + } + + public ReaderOptions fileMetaInfo(FileMetaInfo info) { + fileMetaInfo = info; + return this; + } + + public ReaderOptions filesystem(FileSystem fs) { + this.filesystem = fs; + return this; + } + + public ReaderOptions maxLength(long val) { + maxLength = val; + return this; + } - private final String propName; + public ReaderOptions fileMetadata(FileMetadata metadata) { + this.fullFileMetadata = metadata; + return this; + } + + public Configuration getConfiguration() { + return conf; + } - OrcTableProperties(String propName) { - this.propName = propName; + public FileSystem getFilesystem() { + return filesystem; } - public String getPropName(){ - return this.propName; + public FileMetaInfo getFileMetaInfo() { + return fileMetaInfo; + } + + public long getMaxLength() { + return maxLength; + } + + public FileMetadata getFileMetadata() { + return fullFileMetadata; } } - // unused - private OrcFile() {} + public static ReaderOptions readerOptions(Configuration conf) { + return new ReaderOptions(conf); + } + + - public static interface WriterContext { + public interface WriterContext { Writer getWriter(); } - public static interface WriterCallback { - public void preStripeWrite(WriterContext context) throws IOException; - public void preFooterWrite(WriterContext context) throws IOException; + public interface WriterCallback { + void preStripeWrite(WriterContext context) throws IOException; + void preFooterWrite(WriterContext context) throws IOException; } /** @@ -181,7 +234,7 @@ public static interface WriterCallback { public static class WriterOptions { private final Configuration configuration; private FileSystem fileSystemValue = null; - private ObjectInspector inspectorValue = null; + private TypeDescription schema = null; private long stripeSizeValue; private long blockSizeValue; private int rowIndexStrideValue; @@ -193,45 +246,42 @@ public static class WriterOptions { private WriterCallback 
callback; private EncodingStrategy encodingStrategy; private CompressionStrategy compressionStrategy; - private float paddingTolerance; + private double paddingTolerance; private String bloomFilterColumns; private double bloomFilterFpp; - private TimeZone timezone; - WriterOptions(Configuration conf) { + protected WriterOptions(Properties tableProperties, Configuration conf) { configuration = conf; - memoryManagerValue = getMemoryManager(conf); - stripeSizeValue = OrcConf.getLongVar(conf, HIVE_ORC_DEFAULT_STRIPE_SIZE); - blockSizeValue = OrcConf.getLongVar(conf, HIVE_ORC_DEFAULT_BLOCK_SIZE); - rowIndexStrideValue = OrcConf.getIntVar(conf, HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE); - bufferSizeValue = OrcConf.getIntVar(conf, HIVE_ORC_DEFAULT_BUFFER_SIZE); - blockPaddingValue = OrcConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING); - compressValue = CompressionKind.valueOf(OrcConf.getVar(conf, HIVE_ORC_DEFAULT_COMPRESS)); - String versionName = OrcConf.getVar(conf, HIVE_ORC_WRITE_FORMAT); - if (versionName == null) { - versionValue = Version.CURRENT; - } else { - versionValue = Version.byName(versionName); - } - String enString = - conf.get(OrcConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname); - if (enString == null) { - encodingStrategy = EncodingStrategy.SPEED; - } else { - encodingStrategy = EncodingStrategy.valueOf(enString); - } - - String compString = conf - .get(OrcConf.ConfVars.HIVE_ORC_COMPRESSION_STRATEGY.varname); - if (compString == null) { - compressionStrategy = CompressionStrategy.SPEED; - } else { - compressionStrategy = CompressionStrategy.valueOf(compString); - } - - paddingTolerance = conf.getFloat(OrcConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname, - OrcConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal); - bloomFilterFpp = BloomFilterIO.DEFAULT_FPP; + memoryManagerValue = getStaticMemoryManager(conf); + stripeSizeValue = org.apache.orc.OrcConf.STRIPE_SIZE.getLong(tableProperties, conf); + blockSizeValue = 
org.apache.orc.OrcConf.BLOCK_SIZE.getLong(tableProperties, conf); + rowIndexStrideValue = + (int) org.apache.orc.OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf); + bufferSizeValue = (int) org.apache.orc.OrcConf.BUFFER_SIZE.getLong(tableProperties, + conf); + blockPaddingValue = + org.apache.orc.OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf); + compressValue = + CompressionKind.valueOf(org.apache.orc.OrcConf.COMPRESS.getString(tableProperties, + conf)); + String versionName = org.apache.orc.OrcConf.WRITE_FORMAT.getString(tableProperties, + conf); + versionValue = Version.byName(versionName); + String enString = org.apache.orc.OrcConf.ENCODING_STRATEGY.getString(tableProperties, + conf); + encodingStrategy = EncodingStrategy.valueOf(enString); + + String compString = + org.apache.orc.OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf); + compressionStrategy = CompressionStrategy.valueOf(compString); + + paddingTolerance = + org.apache.orc.OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf); + + bloomFilterColumns = org.apache.orc.OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties, + conf); + bloomFilterFpp = org.apache.orc.OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, + conf); } /** @@ -302,7 +352,7 @@ public WriterOptions encodingStrategy(EncodingStrategy strategy) { /** * Sets the tolerance for block padding as a percentage of stripe size. */ - public WriterOptions paddingTolerance(float value) { + public WriterOptions paddingTolerance(double value) { paddingTolerance = value; return this; } @@ -318,7 +368,7 @@ public WriterOptions bloomFilterColumns(String columns) { /** * Specify the false positive probability for bloom filter. 
* @param fpp - false positive probability - * @return + * @return this */ public WriterOptions bloomFilterFpp(double fpp) { bloomFilterFpp = fpp; @@ -334,11 +384,12 @@ public WriterOptions compress(CompressionKind value) { } /** - * A required option that sets the object inspector for the rows. Used - * to determine the schema for the file. + * Set the schema for the file. This is a required parameter. + * @param schema the schema for the file. + * @return this */ - public WriterOptions inspector(ObjectInspector value) { - inspectorValue = value; + public WriterOptions setSchema(TypeDescription schema) { + this.schema = schema; return this; } @@ -353,7 +404,7 @@ public WriterOptions version(Version value) { /** * Add a listener for when the stripe and file are about to be closed. * @param callback the object to be called when the stripe is closed - * @return + * @return this */ public WriterOptions callback(WriterCallback callback) { this.callback = callback; @@ -363,25 +414,112 @@ public WriterOptions callback(WriterCallback callback) { /** * A package local option to set the memory manager. 
*/ - WriterOptions memory(MemoryManager value) { + protected WriterOptions memory(MemoryManager value) { memoryManagerValue = value; return this; } - /** - * Tajo-specific - */ - WriterOptions timezone(TimeZone value) { - timezone = value; - return this; + public boolean getBlockPadding() { + return blockPaddingValue; + } + + public long getBlockSize() { + return blockSizeValue; + } + + public String getBloomFilterColumns() { + return bloomFilterColumns; } + + public FileSystem getFileSystem() { + return fileSystemValue; + } + + public Configuration getConfiguration() { + return configuration; + } + + public TypeDescription getSchema() { + return schema; + } + + public long getStripeSize() { + return stripeSizeValue; + } + + public CompressionKind getCompress() { + return compressValue; + } + + public WriterCallback getCallback() { + return callback; + } + + public Version getVersion() { + return versionValue; + } + + public MemoryManager getMemoryManager() { + return memoryManagerValue; + } + + public int getBufferSize() { + return bufferSizeValue; + } + + public int getRowIndexStride() { + return rowIndexStrideValue; + } + + public CompressionStrategy getCompressionStrategy() { + return compressionStrategy; + } + + public EncodingStrategy getEncodingStrategy() { + return encodingStrategy; + } + + public double getPaddingTolerance() { + return paddingTolerance; + } + + public double getBloomFilterFpp() { + return bloomFilterFpp; + } + } + + /** + * Create a set of writer options based on a configuration. + * @param conf the configuration to use for values + * @return A WriterOptions object that can be modified + */ + public static ORCAppender.WriterOptions writerOptions(Configuration conf) { + return new ORCAppender.WriterOptions(null, conf); } /** - * Create a default set of write options that can be modified. + * Create a set of write options based on a set of table properties and + * configuration. 
+ * @param tableProperties the properties of the table + * @param conf the configuration of the query + * @return a WriterOptions object that can be modified */ - public static WriterOptions writerOptions(Configuration conf) { - return new WriterOptions(conf); + public static WriterOptions writerOptions(Properties tableProperties, + Configuration conf) { + return new WriterOptions(tableProperties, conf); + } + + private static synchronized MemoryManager getStaticMemoryManager( + final Configuration conf) { + if (memoryManager == null) { + memoryManager = new ThreadLocal() { + @Override + protected MemoryManager initialValue() { + return new MemoryManager(conf); + } + }; + } + return memoryManager.get(); } /** @@ -393,54 +531,13 @@ public static WriterOptions writerOptions(Configuration conf) { * @throws IOException */ public static Writer createWriter(Path path, - WriterOptions opts - ) throws IOException { - FileSystem fs = opts.fileSystemValue == null ? - path.getFileSystem(opts.configuration) : opts.fileSystemValue; - - return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue, - opts.stripeSizeValue, opts.compressValue, - opts.bufferSizeValue, opts.rowIndexStrideValue, - opts.memoryManagerValue, opts.blockPaddingValue, - opts.versionValue, opts.callback, - opts.encodingStrategy, opts.compressionStrategy, - opts.paddingTolerance, opts.blockSizeValue, - opts.bloomFilterColumns, opts.bloomFilterFpp, - opts.timezone); - } + WriterOptions opts, + TimeZone timeZone + ) throws IOException { + FileSystem fs = opts.getFileSystem() == null ? + path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem(); - /** - * Create an ORC file writer. This method is provided for API backward - * compatability with Hive 0.11. 
- * @param fs file system - * @param path filename to write to - * @param inspector the ObjectInspector that inspects the rows - * @param stripeSize the number of bytes in a stripe - * @param compress how to compress the file - * @param bufferSize the number of bytes to compress at once - * @param rowIndexStride the number of rows between row index entries or - * 0 to suppress all indexes - * @return a new ORC file writer - * @throws IOException - */ - public static Writer createWriter(FileSystem fs, - Path path, - Configuration conf, - ObjectInspector inspector, - long stripeSize, - CompressionKind compress, - int bufferSize, - int rowIndexStride, - TimeZone timeZone) throws IOException { - return createWriter(path, - writerOptions(conf) - .fileSystem(fs) - .inspector(inspector) - .stripeSize(stripeSize) - .compress(compress) - .bufferSize(bufferSize) - .rowIndexStride(rowIndexStride) - .timezone(timeZone)); + return new WriterImpl(fs, path, opts, timeZone); } private static ThreadLocal memoryManager = null; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java index 18a602bd34..c018c802d8 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java @@ -25,8 +25,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.hive.common.io.DiskRangeList; -import org.apache.orc.CompressionCodec; -import org.apache.orc.DataReader; +import org.apache.orc.*; import org.apache.orc.OrcProto; import org.apache.orc.impl.*; import org.apache.orc.impl.StreamName; @@ -51,7 +50,7 @@ public class OrcRecordReader implements Closeable { private final Path path; private final long firstRow; - 
private final List stripes = new ArrayList<>(); + private final List stripes = new ArrayList<>(); private OrcProto.StripeFooter stripeFooter; private final long totalRowCount; private final CompressionCodec codec; @@ -69,12 +68,12 @@ public class OrcRecordReader implements Closeable { private final OrcProto.RowIndex[] indexes; private final OrcProto.BloomFilterIndex[] bloomFilterIndices; private final Configuration conf; - private final org.apache.tajo.storage.thirdparty.orc.MetadataReader metadata; + private final MetadataReader metadata; private final DataReader dataReader; private final Tuple result; public OrcRecordReader(TableMeta meta, - List stripes, + List stripes, FileSystem fileSystem, Schema schema, Column[] target, @@ -101,13 +100,13 @@ public OrcRecordReader(TableMeta meta, included[i] = targetSchema.contains(schema.getColumn(i - 1)); } this.rowIndexStride = strideRate; - this.metadata = new org.apache.tajo.storage.thirdparty.orc.MetadataReader(fileSystem, path, codec, bufferSize, types.size()); + this.metadata = new MetadataReaderImpl(fileSystem, path, codec, bufferSize, types.size()); long rows = 0; long skippedRows = 0; long offset = fragment.getStartKey(); long maxOffset = fragment.getStartKey() + fragment.getLength(); - for(OrcProto.StripeInformation stripe: stripes) { + for(StripeInformation stripe: stripes) { long stripeStart = stripe.getOffset(); if (offset > stripeStart) { skippedRows += stripe.getNumberOfRows(); @@ -191,7 +190,7 @@ void createStreams(List streamDescriptions, } } - private void readPartialDataStreams(OrcProto.StripeInformation stripe) throws IOException { + private void readPartialDataStreams(StripeInformation stripe) throws IOException { List streamList = stripeFooter.getStreamsList(); DiskRangeList toRead = planReadPartialDataStreams(streamList, included, true); if (LOG.isDebugEnabled()) { @@ -283,7 +282,7 @@ private void advanceStripe() throws IOException { * @throws IOException */ private void readStripe() throws 
IOException { - OrcProto.StripeInformation stripe = beginReadStripe(); + StripeInformation stripe = beginReadStripe(); // if we haven't skipped the whole stripe, read the data if (rowInStripe < rowCountInStripe) { @@ -323,12 +322,12 @@ private void clearStreams() throws IOException { streams.clear(); } - OrcProto.StripeFooter readStripeFooter(OrcProto.StripeInformation stripe) throws IOException { + OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException { return metadata.readStripeFooter(stripe); } - private OrcProto.StripeInformation beginReadStripe() throws IOException { - OrcProto.StripeInformation stripe = stripes.get(currentStripe); + private StripeInformation beginReadStripe() throws IOException { + StripeInformation stripe = stripes.get(currentStripe); stripeFooter = readStripeFooter(stripe); clearStreams(); // setup the position in the stripe @@ -345,7 +344,7 @@ private OrcProto.StripeInformation beginReadStripe() throws IOException { return stripe; } - private void readAllDataStreams(OrcProto.StripeInformation stripe) throws IOException { + private void readAllDataStreams(StripeInformation stripe) throws IOException { long start = stripe.getIndexLength(); long end = start + stripe.getDataLength(); // explicitly trigger 1 big read @@ -365,7 +364,7 @@ public float getProgress() { private int findStripe(long rowNumber) { for (int i = 0; i < stripes.size(); i++) { - OrcProto.StripeInformation stripe = stripes.get(i); + StripeInformation stripe = stripes.get(i); if (stripe.getNumberOfRows() > rowNumber) { return i; } @@ -381,7 +380,7 @@ OrcIndex readRowIndex( OrcIndex readRowIndex(int stripeIndex, boolean[] included, OrcProto.RowIndex[] indexes, OrcProto.BloomFilterIndex[] bloomFilterIndex) throws IOException { - OrcProto.StripeInformation stripe = stripes.get(stripeIndex); + StripeInformation stripe = stripes.get(stripeIndex); OrcProto.StripeFooter stripeFooter = null; // if this is the current stripe, use the cached objects. 
if (stripeIndex == currentStripe) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java index 5c7fa458ee..91e4dc60d4 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java @@ -17,14 +17,22 @@ */ package org.apache.tajo.storage.thirdparty.orc; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.orc.*; +import org.apache.hadoop.hive.serde2.typeinfo.*; import org.apache.orc.CompressionCodec; -import org.apache.orc.impl.*; +import org.apache.orc.TypeDescription; +import org.apache.orc.TypeDescription.Category; import org.apache.orc.impl.SnappyCodec; +import org.apache.tajo.catalog.Column; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TypeDesc; +import org.apache.tajo.common.TajoDataTypes.Type; +import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedDataTypeException; import java.util.Arrays; import java.util.HashMap; @@ -233,4 +241,55 @@ public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.Compres kind); } } + + public static TypeDescription convertSchema(Schema schema) { + TypeDescription description = TypeDescription.createStruct(); + + for (Column eachColumn : schema.getRootColumns()) { + description.addField(eachColumn.getQualifiedName(), + convertTypeInfo(eachColumn.getTypeDesc())); + } + return description; + } + + public static TypeDescription convertTypeInfo(TypeDesc desc) { + switch (desc.getDataType().getType()) { + case BOOLEAN: + return 
TypeDescription.createBoolean(); + case BIT: + return TypeDescription.createByte(); + case INT2: + return TypeDescription.createShort(); + case INT4: + case INET4: + return TypeDescription.createInt(); + case INT8: + return TypeDescription.createLong(); + case FLOAT4: + return TypeDescription.createFloat(); + case FLOAT8: + return TypeDescription.createDouble(); + case TEXT: + return TypeDescription.createString(); + case DATE: + return TypeDescription.createDate(); + case TIMESTAMP: + return TypeDescription.createTimestamp(); + case BLOB: + return TypeDescription.createBinary(); + case CHAR: + return TypeDescription.createChar() + .withMaxLength(desc.getDataType().getLength()); + case RECORD: { + TypeDescription result = TypeDescription.createStruct(); + for (Column eachColumn : desc.getNestedSchema().getRootColumns()) { + result.addField(eachColumn.getQualifiedName(), + convertTypeInfo(eachColumn.getTypeDesc())); + } + return result; + } + default: + throw new TajoRuntimeException(new UnsupportedDataTypeException(desc.getDataType().getType().name())); + } + } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java deleted file mode 100644 index f6cfd579b0..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java +++ /dev/null @@ -1,286 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; -import java.nio.ByteBuffer; - -class OutStream extends PositionedOutputStream { - - interface OutputReceiver { - /** - * Output the given buffer to the final destination - * @param buffer the buffer to output - * @throws IOException - */ - void output(ByteBuffer buffer) throws IOException; - } - - static final int HEADER_SIZE = 3; - private final String name; - private final OutputReceiver receiver; - // if enabled the stream will be suppressed when writing stripe - private boolean suppress; - - /** - * Stores the uncompressed bytes that have been serialized, but not - * compressed yet. When this fills, we compress the entire buffer. - */ - private ByteBuffer current = null; - - /** - * Stores the compressed bytes until we have a full buffer and then outputs - * them to the receiver. If no compression is being done, this (and overflow) - * will always be null and the current buffer will be sent directly to the - * receiver. - */ - private ByteBuffer compressed = null; - - /** - * Since the compressed buffer may start with contents from previous - * compression blocks, we allocate an overflow buffer so that the - * output of the codec can be split between the two buffers. After the - * compressed buffer is sent to the receiver, the overflow buffer becomes - * the new compressed buffer. 
- */ - private ByteBuffer overflow = null; - private final int bufferSize; - private final CompressionCodec codec; - private long compressedBytes = 0; - private long uncompressedBytes = 0; - - OutStream(String name, - int bufferSize, - CompressionCodec codec, - OutputReceiver receiver) throws IOException { - this.name = name; - this.bufferSize = bufferSize; - this.codec = codec; - this.receiver = receiver; - this.suppress = false; - } - - public void clear() throws IOException { - flush(); - suppress = false; - } - - /** - * Write the length of the compressed bytes. Life is much easier if the - * header is constant length, so just use 3 bytes. Considering most of the - * codecs want between 32k (snappy) and 256k (lzo, zlib), 3 bytes should - * be plenty. We also use the low bit for whether it is the original or - * compressed bytes. - * @param buffer the buffer to write the header to - * @param position the position in the buffer to write at - * @param val the size in the file - * @param original is it uncompressed - */ - private static void writeHeader(ByteBuffer buffer, - int position, - int val, - boolean original) { - buffer.put(position, (byte) ((val << 1) + (original ? 1 : 0))); - buffer.put(position + 1, (byte) (val >> 7)); - buffer.put(position + 2, (byte) (val >> 15)); - } - - private void getNewInputBuffer() throws IOException { - if (codec == null) { - current = ByteBuffer.allocate(bufferSize); - } else { - current = ByteBuffer.allocate(bufferSize + HEADER_SIZE); - writeHeader(current, 0, bufferSize, true); - current.position(HEADER_SIZE); - } - } - - /** - * Allocate a new output buffer if we are compressing. - */ - private ByteBuffer getNewOutputBuffer() throws IOException { - return ByteBuffer.allocate(bufferSize + HEADER_SIZE); - } - - private void flip() throws IOException { - current.limit(current.position()); - current.position(codec == null ? 
0 : HEADER_SIZE); - } - - @Override - public void write(int i) throws IOException { - if (current == null) { - getNewInputBuffer(); - } - if (current.remaining() < 1) { - spill(); - } - uncompressedBytes += 1; - current.put((byte) i); - } - - @Override - public void write(byte[] bytes, int offset, int length) throws IOException { - if (current == null) { - getNewInputBuffer(); - } - int remaining = Math.min(current.remaining(), length); - current.put(bytes, offset, remaining); - uncompressedBytes += remaining; - length -= remaining; - while (length != 0) { - spill(); - offset += remaining; - remaining = Math.min(current.remaining(), length); - current.put(bytes, offset, remaining); - uncompressedBytes += remaining; - length -= remaining; - } - } - - private void spill() throws IOException { - // if there isn't anything in the current buffer, don't spill - if (current == null || - current.position() == (codec == null ? 0 : HEADER_SIZE)) { - return; - } - flip(); - if (codec == null) { - receiver.output(current); - getNewInputBuffer(); - } else { - if (compressed == null) { - compressed = getNewOutputBuffer(); - } else if (overflow == null) { - overflow = getNewOutputBuffer(); - } - int sizePosn = compressed.position(); - compressed.position(compressed.position() + HEADER_SIZE); - if (codec.compress(current, compressed, overflow)) { - uncompressedBytes = 0; - // move position back to after the header - current.position(HEADER_SIZE); - current.limit(current.capacity()); - // find the total bytes in the chunk - int totalBytes = compressed.position() - sizePosn - HEADER_SIZE; - if (overflow != null) { - totalBytes += overflow.position(); - } - compressedBytes += totalBytes + HEADER_SIZE; - writeHeader(compressed, sizePosn, totalBytes, false); - // if we have less than the next header left, spill it. 
- if (compressed.remaining() < HEADER_SIZE) { - compressed.flip(); - receiver.output(compressed); - compressed = overflow; - overflow = null; - } - } else { - compressedBytes += uncompressedBytes + HEADER_SIZE; - uncompressedBytes = 0; - // we are using the original, but need to spill the current - // compressed buffer first. So back up to where we started, - // flip it and add it to done. - if (sizePosn != 0) { - compressed.position(sizePosn); - compressed.flip(); - receiver.output(compressed); - compressed = null; - // if we have an overflow, clear it and make it the new compress - // buffer - if (overflow != null) { - overflow.clear(); - compressed = overflow; - overflow = null; - } - } else { - compressed.clear(); - if (overflow != null) { - overflow.clear(); - } - } - - // now add the current buffer into the done list and get a new one. - current.position(0); - // update the header with the current length - writeHeader(current, 0, current.limit() - HEADER_SIZE, true); - receiver.output(current); - getNewInputBuffer(); - } - } - } - - void getPosition(PositionRecorder recorder) throws IOException { - if (codec == null) { - recorder.addPosition(uncompressedBytes); - } else { - recorder.addPosition(compressedBytes); - recorder.addPosition(uncompressedBytes); - } - } - - @Override - public void flush() throws IOException { - spill(); - if (compressed != null && compressed.position() != 0) { - compressed.flip(); - receiver.output(compressed); - compressed = null; - } - uncompressedBytes = 0; - compressedBytes = 0; - overflow = null; - current = null; - } - - @Override - public String toString() { - return name; - } - - @Override - public long getBufferSize() { - long result = 0; - if (current != null) { - result += current.capacity(); - } - if (compressed != null) { - result += compressed.capacity(); - } - if (overflow != null) { - result += overflow.capacity(); - } - return result; - } - - /** - * Set suppress flag - */ - public void suppress() { - suppress = 
true; - } - - /** - * Returns the state of suppress flag - * @return value of suppress flag - */ - public boolean isSuppressed() { - return suppress; - } -} - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java deleted file mode 100644 index a39926e005..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * An interface for recording positions in a stream. 
- */ -interface PositionRecorder { - void addPosition(long offset); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java deleted file mode 100644 index 748c98cfbb..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; -import java.io.OutputStream; - -abstract class PositionedOutputStream extends OutputStream { - - /** - * Record the current position to the recorder. - * @param recorder the object that receives the position - * @throws IOException - */ - abstract void getPosition(PositionRecorder recorder) throws IOException; - - /** - * Get the memory size currently allocated as buffer associated with this - * stream. - * @return the number of bytes used by buffers. 
- */ - abstract long getBufferSize(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java deleted file mode 100644 index 2482f93b0b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java +++ /dev/null @@ -1,309 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -/** - * A memory efficient red-black tree that does not allocate any objects per - * an element. This class is abstract and assumes that the child class - * handles the key and comparisons with the key. - */ -abstract class RedBlackTree { - public static final int NULL = -1; - - // Various values controlling the offset of the data within the array. 
- private static final int LEFT_OFFSET = 0; - private static final int RIGHT_OFFSET = 1; - private static final int ELEMENT_SIZE = 2; - - protected int size = 0; - private final DynamicIntArray data; - protected int root = NULL; - protected int lastAdd = 0; - private boolean wasAdd = false; - - /** - * Create a set with the given initial capacity. - */ - public RedBlackTree(int initialCapacity) { - data = new DynamicIntArray(initialCapacity * ELEMENT_SIZE); - } - - /** - * Insert a new node into the data array, growing the array as necessary. - * - * @return Returns the position of the new node. - */ - private int insert(int left, int right, boolean isRed) { - int position = size; - size += 1; - setLeft(position, left, isRed); - setRight(position, right); - return position; - } - - /** - * Compare the value at the given position to the new value. - * @return 0 if the values are the same, -1 if the new value is smaller and - * 1 if the new value is larger. - */ - protected abstract int compareValue(int position); - - /** - * Is the given node red as opposed to black? To prevent having an extra word - * in the data array, we just the low bit on the left child index. - */ - protected boolean isRed(int position) { - return position != NULL && - (data.get(position * ELEMENT_SIZE + LEFT_OFFSET) & 1) == 1; - } - - /** - * Set the red bit true or false. - */ - private void setRed(int position, boolean isRed) { - int offset = position * ELEMENT_SIZE + LEFT_OFFSET; - if (isRed) { - data.set(offset, data.get(offset) | 1); - } else { - data.set(offset, data.get(offset) & ~1); - } - } - - /** - * Get the left field of the given position. - */ - protected int getLeft(int position) { - return data.get(position * ELEMENT_SIZE + LEFT_OFFSET) >> 1; - } - - /** - * Get the right field of the given position. - */ - protected int getRight(int position) { - return data.get(position * ELEMENT_SIZE + RIGHT_OFFSET); - } - - /** - * Set the left field of the given position. 
- * Note that we are storing the node color in the low bit of the left pointer. - */ - private void setLeft(int position, int left) { - int offset = position * ELEMENT_SIZE + LEFT_OFFSET; - data.set(offset, (left << 1) | (data.get(offset) & 1)); - } - - /** - * Set the left field of the given position. - * Note that we are storing the node color in the low bit of the left pointer. - */ - private void setLeft(int position, int left, boolean isRed) { - int offset = position * ELEMENT_SIZE + LEFT_OFFSET; - data.set(offset, (left << 1) | (isRed ? 1 : 0)); - } - - /** - * Set the right field of the given position. - */ - private void setRight(int position, int right) { - data.set(position * ELEMENT_SIZE + RIGHT_OFFSET, right); - } - - /** - * Insert or find a given key in the tree and rebalance the tree correctly. - * Rebalancing restores the red-black aspect of the tree to maintain the - * invariants: - * 1. If a node is red, both of its children are black. - * 2. Each child of a node has the same black height (the number of black - * nodes between it and the leaves of the tree). - * - * Inserted nodes are at the leaves and are red, therefore there is at most a - * violation of rule 1 at the node we just put in. Instead of always keeping - * the parents, this routine passing down the context. - * - * The fix is broken down into 6 cases (1.{1,2,3} and 2.{1,2,3} that are - * left-right mirror images of each other). See Algorighms by Cormen, - * Leiserson, and Rivest for the explaination of the subcases. - * - * @param node The node that we are fixing right now. - * @param fromLeft Did we come down from the left? - * @param parent Nodes' parent - * @param grandparent Parent's parent - * @param greatGrandparent Grandparent's parent - * @return Does parent also need to be checked and/or fixed? 
- */ - private boolean add(int node, boolean fromLeft, int parent, - int grandparent, int greatGrandparent) { - if (node == NULL) { - if (root == NULL) { - lastAdd = insert(NULL, NULL, false); - root = lastAdd; - wasAdd = true; - return false; - } else { - lastAdd = insert(NULL, NULL, true); - node = lastAdd; - wasAdd = true; - // connect the new node into the tree - if (fromLeft) { - setLeft(parent, node); - } else { - setRight(parent, node); - } - } - } else { - int compare = compareValue(node); - boolean keepGoing; - - // Recurse down to find where the node needs to be added - if (compare < 0) { - keepGoing = add(getLeft(node), true, node, parent, grandparent); - } else if (compare > 0) { - keepGoing = add(getRight(node), false, node, parent, grandparent); - } else { - lastAdd = node; - wasAdd = false; - return false; - } - - // we don't need to fix the root (because it is always set to black) - if (node == root || !keepGoing) { - return false; - } - } - - - // Do we need to fix this node? Only if there are two reds right under each - // other. 
- if (isRed(node) && isRed(parent)) { - if (parent == getLeft(grandparent)) { - int uncle = getRight(grandparent); - if (isRed(uncle)) { - // case 1.1 - setRed(parent, false); - setRed(uncle, false); - setRed(grandparent, true); - return true; - } else { - if (node == getRight(parent)) { - // case 1.2 - // swap node and parent - int tmp = node; - node = parent; - parent = tmp; - // left-rotate on node - setLeft(grandparent, parent); - setRight(node, getLeft(parent)); - setLeft(parent, node); - } - - // case 1.2 and 1.3 - setRed(parent, false); - setRed(grandparent, true); - - // right-rotate on grandparent - if (greatGrandparent == NULL) { - root = parent; - } else if (getLeft(greatGrandparent) == grandparent) { - setLeft(greatGrandparent, parent); - } else { - setRight(greatGrandparent, parent); - } - setLeft(grandparent, getRight(parent)); - setRight(parent, grandparent); - return false; - } - } else { - int uncle = getLeft(grandparent); - if (isRed(uncle)) { - // case 2.1 - setRed(parent, false); - setRed(uncle, false); - setRed(grandparent, true); - return true; - } else { - if (node == getLeft(parent)) { - // case 2.2 - // swap node and parent - int tmp = node; - node = parent; - parent = tmp; - // right-rotate on node - setRight(grandparent, parent); - setLeft(node, getRight(parent)); - setRight(parent, node); - } - // case 2.2 and 2.3 - setRed(parent, false); - setRed(grandparent, true); - // left-rotate on grandparent - if (greatGrandparent == NULL) { - root = parent; - } else if (getRight(greatGrandparent) == grandparent) { - setRight(greatGrandparent, parent); - } else { - setLeft(greatGrandparent, parent); - } - setRight(grandparent, getLeft(parent)); - setLeft(parent, grandparent); - return false; - } - } - } else { - return true; - } - } - - /** - * Add the new key to the tree. - * @return true if the element is a new one. 
- */ - protected boolean add() { - add(root, false, NULL, NULL, NULL); - if (wasAdd) { - setRed(root, false); - return true; - } else { - return false; - } - } - - /** - * Get the number of elements in the set. - */ - public int size() { - return size; - } - - /** - * Reset the table to empty. - */ - public void clear() { - root = NULL; - size = 0; - data.clear(); - } - - /** - * Get the buffer size in bytes. - */ - public long getSizeInBytes() { - return data.getSizeInBytes(); - } -} - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java deleted file mode 100644 index 0953cdd2a1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; - -/** - * A streamFactory that writes a sequence of bytes. A control byte is written before - * each run with positive values 0 to 127 meaning 2 to 129 repetitions. 
If the - * bytes is -1 to -128, 1 to 128 literal byte values follow. - */ -class RunLengthByteWriter { - static final int MIN_REPEAT_SIZE = 3; - static final int MAX_LITERAL_SIZE = 128; - static final int MAX_REPEAT_SIZE= 127 + MIN_REPEAT_SIZE; - private final PositionedOutputStream output; - private final byte[] literals = new byte[MAX_LITERAL_SIZE]; - private int numLiterals = 0; - private boolean repeat = false; - private int tailRunLength = 0; - - RunLengthByteWriter(PositionedOutputStream output) { - this.output = output; - } - - private void writeValues() throws IOException { - if (numLiterals != 0) { - if (repeat) { - output.write(numLiterals - MIN_REPEAT_SIZE); - output.write(literals, 0, 1); - } else { - output.write(-numLiterals); - output.write(literals, 0, numLiterals); - } - repeat = false; - tailRunLength = 0; - numLiterals = 0; - } - } - - void flush() throws IOException { - writeValues(); - output.flush(); - } - - void write(byte value) throws IOException { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0]) { - numLiterals += 1; - if (numLiterals == MAX_REPEAT_SIZE) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (value == literals[numLiterals - 1]) { - tailRunLength += 1; - } else { - tailRunLength = 1; - } - if (tailRunLength == MIN_REPEAT_SIZE) { - if (numLiterals + 1 == MIN_REPEAT_SIZE) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= MIN_REPEAT_SIZE - 1; - writeValues(); - literals[0] = value; - repeat = true; - numLiterals = MIN_REPEAT_SIZE; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } - } - - void getPosition(PositionRecorder recorder) throws IOException { - output.getPosition(recorder); - recorder.addPosition(numLiterals); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java deleted file mode 100644 index 867f041912..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; - -/** - * A streamFactory that writes a sequence of integers. A control byte is written before - * each run with positive values 0 to 127 meaning 3 to 130 repetitions, each - * repetition is offset by a delta. If the control byte is -1 to -128, 1 to 128 - * literal vint values follow. 
- */ -class RunLengthIntegerWriter implements IntegerWriter { - static final int MIN_REPEAT_SIZE = 3; - static final int MAX_DELTA = 127; - static final int MIN_DELTA = -128; - static final int MAX_LITERAL_SIZE = 128; - private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE; - private final PositionedOutputStream output; - private final boolean signed; - private final long[] literals = new long[MAX_LITERAL_SIZE]; - private int numLiterals = 0; - private long delta = 0; - private boolean repeat = false; - private int tailRunLength = 0; - private SerializationUtils utils; - - RunLengthIntegerWriter(PositionedOutputStream output, - boolean signed) { - this.output = output; - this.signed = signed; - this.utils = new SerializationUtils(); - } - - private void writeValues() throws IOException { - if (numLiterals != 0) { - if (repeat) { - output.write(numLiterals - MIN_REPEAT_SIZE); - output.write((byte) delta); - if (signed) { - utils.writeVslong(output, literals[0]); - } else { - utils.writeVulong(output, literals[0]); - } - } else { - output.write(-numLiterals); - for(int i=0; i < numLiterals; ++i) { - if (signed) { - utils.writeVslong(output, literals[i]); - } else { - utils.writeVulong(output, literals[i]); - } - } - } - repeat = false; - numLiterals = 0; - tailRunLength = 0; - } - } - - @Override - public void flush() throws IOException { - writeValues(); - output.flush(); - } - - @Override - public void write(long value) throws IOException { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0] + delta * numLiterals) { - numLiterals += 1; - if (numLiterals == MAX_REPEAT_SIZE) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (tailRunLength == 1) { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } else 
if (value == literals[numLiterals - 1] + delta) { - tailRunLength += 1; - } else { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } - if (tailRunLength == MIN_REPEAT_SIZE) { - if (numLiterals + 1 == MIN_REPEAT_SIZE) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= MIN_REPEAT_SIZE - 1; - long base = literals[numLiterals]; - writeValues(); - literals[0] = base; - repeat = true; - numLiterals = MIN_REPEAT_SIZE; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } - } - - @Override - public void getPosition(PositionRecorder recorder) throws IOException { - output.getPosition(recorder); - recorder.addPosition(numLiterals); - } - -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java deleted file mode 100644 index 7237b2e29d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java +++ /dev/null @@ -1,832 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.IOException; - -/** - * A writer that performs light weight compression over sequence of integers. - *

- * There are four types of lightweight integer compression - *

    - *
  • SHORT_REPEAT
  • - *
  • DIRECT
  • - *
  • PATCHED_BASE
  • - *
  • DELTA
  • - *
- *

- * The description and format for these types are as below: - *

- * SHORT_REPEAT: Used for short repeated integer sequences. - *

    - *
  • 1 byte header - *
      - *
    • 2 bits for encoding type
    • - *
    • 3 bits for bytes required for repeating value
    • - *
    • 3 bits for repeat count (MIN_REPEAT + run length)
    • - *
    - *
  • - *
  • Blob - repeat value (fixed bytes)
  • - *
- *

- *

- * DIRECT: Used for random integer sequences whose number of bit - * requirement doesn't vary a lot. - *

    - *
  • 2 bytes header - *
      - * 1st byte - *
    • 2 bits for encoding type
    • - *
    • 5 bits for fixed bit width of values in blob
    • - *
    • 1 bit for storing MSB of run length
    • - *
    - *
      - * 2nd byte - *
    • 8 bits for lower run length bits
    • - *
    - *
  • - *
  • Blob - stores the direct values using fixed bit width. The length of the - * data blob is (fixed width * run length) bits long
  • - *
- *

- *

- * PATCHED_BASE: Used for random integer sequences whose number of bit - * requirement varies beyond a threshold. - *

    - *
  • 4 bytes header - *
      - * 1st byte - *
    • 2 bits for encoding type
    • - *
    • 5 bits for fixed bit width of values in blob
    • - *
    • 1 bit for storing MSB of run length
    • - *
    - *
      - * 2nd byte - *
    • 8 bits for lower run length bits
    • - *
    - *
      - * 3rd byte - *
    • 3 bits for bytes required to encode base value
    • - *
    • 5 bits for patch width
    • - *
    - *
      - * 4th byte - *
    • 3 bits for patch gap width
    • - *
    • 5 bits for patch length
    • - *
    - *
  • - *
  • Base value - Stored using fixed number of bytes. If MSB is set, base - * value is negative else positive. Length of base value is (base width * 8) - * bits.
  • - *
  • Data blob - Base reduced values as stored using fixed bit width. Length - * of data blob is (fixed width * run length) bits.
  • - *
  • Patch blob - Patch blob is a list of gap and patch value. Each entry in - * the patch list is (patch width + patch gap width) bits long. Gap between the - * subsequent elements to be patched are stored in upper part of entry whereas - * patch values are stored in lower part of entry. Length of patch blob is - * ((patch width + patch gap width) * patch length) bits.
  • - *
- *

- *

- * DELTA Used for monotonically increasing or decreasing sequences, - * sequences with fixed delta values or long repeated sequences. - *

    - *
  • 2 bytes header - *
      - * 1st byte - *
    • 2 bits for encoding type
    • - *
    • 5 bits for fixed bit width of values in blob
    • - *
    • 1 bit for storing MSB of run length
    • - *
    - *
      - * 2nd byte - *
    • 8 bits for lower run length bits
    • - *
    - *
  • - *
  • Base value - encoded as varint
  • - *
  • Delta base - encoded as varint
  • - *
  • Delta blob - only positive values. monotonicity and orderness are decided - * based on the sign of the base value and delta base
  • - *
- *

- */ -class RunLengthIntegerWriterV2 implements IntegerWriter { - - public enum EncodingType { - SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA - } - - static final int MAX_SCOPE = 512; - static final int MIN_REPEAT = 3; - private static final int MAX_SHORT_REPEAT_LENGTH = 10; - private long prevDelta = 0; - private int fixedRunLength = 0; - private int variableRunLength = 0; - private final long[] literals = new long[MAX_SCOPE]; - private final PositionedOutputStream output; - private final boolean signed; - private EncodingType encoding; - private int numLiterals; - private final long[] zigzagLiterals = new long[MAX_SCOPE]; - private final long[] baseRedLiterals = new long[MAX_SCOPE]; - private final long[] adjDeltas = new long[MAX_SCOPE]; - private long fixedDelta; - private int zzBits90p; - private int zzBits100p; - private int brBits95p; - private int brBits100p; - private int bitsDeltaMax; - private int patchWidth; - private int patchGapWidth; - private int patchLength; - private long[] gapVsPatchList; - private long min; - private boolean isFixedDelta; - private SerializationUtils utils; - private boolean alignedBitpacking; - - RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) { - this(output, signed, true); - } - - RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed, - boolean alignedBitpacking) { - this.output = output; - this.signed = signed; - this.alignedBitpacking = alignedBitpacking; - this.utils = new SerializationUtils(); - clear(); - } - - private void writeValues() throws IOException { - if (numLiterals != 0) { - - if (encoding.equals(EncodingType.SHORT_REPEAT)) { - writeShortRepeatValues(); - } else if (encoding.equals(EncodingType.DIRECT)) { - writeDirectValues(); - } else if (encoding.equals(EncodingType.PATCHED_BASE)) { - writePatchedBaseValues(); - } else { - writeDeltaValues(); - } - - // clear all the variables - clear(); - } - } - - private void writeDeltaValues() throws IOException { - int len = 0; - 
int fb = bitsDeltaMax; - int efb = 0; - - if (alignedBitpacking) { - fb = utils.getClosestAlignedFixedBits(fb); - } - - if (isFixedDelta) { - // if fixed run length is greater than threshold then it will be fixed - // delta sequence with delta value 0 else fixed delta sequence with - // non-zero delta value - if (fixedRunLength > MIN_REPEAT) { - // ex. sequence: 2 2 2 2 2 2 2 2 - len = fixedRunLength - 1; - fixedRunLength = 0; - } else { - // ex. sequence: 4 6 8 10 12 14 16 - len = variableRunLength - 1; - variableRunLength = 0; - } - } else { - // fixed width 0 is used for long repeating values. - // sequences that require only 1 bit to encode will have an additional bit - if (fb == 1) { - fb = 2; - } - efb = utils.encodeBitWidth(fb); - efb = efb << 1; - len = variableRunLength - 1; - variableRunLength = 0; - } - - // extract the 9th bit of run length - final int tailBits = (len & 0x100) >>> 8; - - // create first byte of the header - final int headerFirstByte = getOpcode() | efb | tailBits; - - // second byte of the header stores the remaining 8 bits of runlength - final int headerSecondByte = len & 0xff; - - // write header - output.write(headerFirstByte); - output.write(headerSecondByte); - - // store the first value from zigzag literal array - if (signed) { - utils.writeVslong(output, literals[0]); - } else { - utils.writeVulong(output, literals[0]); - } - - if (isFixedDelta) { - // if delta is fixed then we don't need to store delta blob - utils.writeVslong(output, fixedDelta); - } else { - // store the first value as delta value using zigzag encoding - utils.writeVslong(output, adjDeltas[0]); - - // adjacent delta values are bit packed. The length of adjDeltas array is - // always one less than the number of literals (delta difference for n - // elements is n-1). 
We have already written one element, write the - // remaining numLiterals - 2 elements here - utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output); - } - } - - private void writePatchedBaseValues() throws IOException { - - // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding - // because patch is applied to MSB bits. For example: If fixed bit width of - // base value is 7 bits and if patch is 3 bits, the actual value is - // constructed by shifting the patch to left by 7 positions. - // actual_value = patch << 7 | base_value - // So, if we align base_value then actual_value can not be reconstructed. - - // write the number of fixed bits required in next 5 bits - final int fb = brBits95p; - final int efb = utils.encodeBitWidth(fb) << 1; - - // adjust variable run length, they are one off - variableRunLength -= 1; - - // extract the 9th bit of run length - final int tailBits = (variableRunLength & 0x100) >>> 8; - - // create first byte of the header - final int headerFirstByte = getOpcode() | efb | tailBits; - - // second byte of the header stores the remaining 8 bits of runlength - final int headerSecondByte = variableRunLength & 0xff; - - // if the min value is negative toggle the sign - final boolean isNegative = min < 0 ? true : false; - if (isNegative) { - min = -min; - } - - // find the number of bytes required for base and shift it by 5 bits - // to accommodate patch width. The additional bit is used to store the sign - // of the base value. - final int baseWidth = utils.findClosestNumBits(min) + 1; - final int baseBytes = baseWidth % 8 == 0 ? 
baseWidth / 8 : (baseWidth / 8) + 1; - final int bb = (baseBytes - 1) << 5; - - // if the base value is negative then set MSB to 1 - if (isNegative) { - min |= (1L << ((baseBytes * 8) - 1)); - } - - // third byte contains 3 bits for number of bytes occupied by base - // and 5 bits for patchWidth - final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth); - - // fourth byte contains 3 bits for page gap width and 5 bits for - // patch length - final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength; - - // write header - output.write(headerFirstByte); - output.write(headerSecondByte); - output.write(headerThirdByte); - output.write(headerFourthByte); - - // write the base value using fixed bytes in big endian order - for(int i = baseBytes - 1; i >= 0; i--) { - byte b = (byte) ((min >>> (i * 8)) & 0xff); - output.write(b); - } - - // base reduced literals are bit packed - int closestFixedBits = utils.getClosestFixedBits(fb); - - utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits, - output); - - // write patch list - closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth); - - utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits, - output); - - // reset run length - variableRunLength = 0; - } - - /** - * Store the opcode in 2 MSB bits - * @return opcode - */ - private int getOpcode() { - return encoding.ordinal() << 6; - } - - private void writeDirectValues() throws IOException { - - // write the number of fixed bits required in next 5 bits - int fb = zzBits100p; - - if (alignedBitpacking) { - fb = utils.getClosestAlignedFixedBits(fb); - } - - final int efb = utils.encodeBitWidth(fb) << 1; - - // adjust variable run length - variableRunLength -= 1; - - // extract the 9th bit of run length - final int tailBits = (variableRunLength & 0x100) >>> 8; - - // create first byte of the header - final int headerFirstByte = getOpcode() | efb | tailBits; - - // second byte of the header stores the remaining 
8 bits of runlength - final int headerSecondByte = variableRunLength & 0xff; - - // write header - output.write(headerFirstByte); - output.write(headerSecondByte); - - // bit packing the zigzag encoded literals - utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output); - - // reset run length - variableRunLength = 0; - } - - private void writeShortRepeatValues() throws IOException { - // get the value that is repeating, compute the bits and bytes required - long repeatVal = 0; - if (signed) { - repeatVal = utils.zigzagEncode(literals[0]); - } else { - repeatVal = literals[0]; - } - - final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal); - final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3 - : (numBitsRepeatVal >>> 3) + 1; - - // write encoding type in top 2 bits - int header = getOpcode(); - - // write the number of bytes required for the value - header |= ((numBytesRepeatVal - 1) << 3); - - // write the run length - fixedRunLength -= MIN_REPEAT; - header |= fixedRunLength; - - // write the header - output.write(header); - - // write the repeating value in big endian byte order - for(int i = numBytesRepeatVal - 1; i >= 0; i--) { - int b = (int) ((repeatVal >>> (i * 8)) & 0xff); - output.write(b); - } - - fixedRunLength = 0; - } - - private void determineEncoding() { - - // we need to compute zigzag values for DIRECT encoding if we decide to - // break early for delta overflows or for shorter runs - computeZigZagLiterals(); - - zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - - // not a big win for shorter runs to determine encoding - if (numLiterals <= MIN_REPEAT) { - encoding = EncodingType.DIRECT; - return; - } - - // DELTA encoding check - - // for identifying monotonic sequences - boolean isIncreasing = true; - boolean isDecreasing = true; - this.isFixedDelta = true; - - this.min = literals[0]; - long max = literals[0]; - final long initialDelta = literals[1] - literals[0]; - long currDelta 
= initialDelta; - long deltaMax = initialDelta; - this.adjDeltas[0] = initialDelta; - - for (int i = 1; i < numLiterals; i++) { - final long l1 = literals[i]; - final long l0 = literals[i - 1]; - currDelta = l1 - l0; - min = Math.min(min, l1); - max = Math.max(max, l1); - - isIncreasing &= (l0 <= l1); - isDecreasing &= (l0 >= l1); - - isFixedDelta &= (currDelta == initialDelta); - if (i > 1) { - adjDeltas[i - 1] = Math.abs(currDelta); - deltaMax = Math.max(deltaMax, adjDeltas[i - 1]); - } - } - - // its faster to exit under delta overflow condition without checking for - // PATCHED_BASE condition as encoding using DIRECT is faster and has less - // overhead than PATCHED_BASE - if (!utils.isSafeSubtract(max, min)) { - encoding = EncodingType.DIRECT; - return; - } - - // invariant - subtracting any number from any other in the literals after - // this point won't overflow - - // if initialDelta is 0 then we cannot delta encode as we cannot identify - // the sign of deltas (increasing or decreasing) - if (initialDelta != 0) { - - // if min is equal to max then the delta is 0, this condition happens for - // fixed values run >10 which cannot be encoded with SHORT_REPEAT - if (min == max) { - assert isFixedDelta : min + "==" + max + - ", isFixedDelta cannot be false"; - assert currDelta == 0 : min + "==" + max + ", currDelta should be zero"; - fixedDelta = 0; - encoding = EncodingType.DELTA; - return; - } - - if (isFixedDelta) { - assert currDelta == initialDelta - : "currDelta should be equal to initialDelta for fixed delta encoding"; - encoding = EncodingType.DELTA; - fixedDelta = currDelta; - return; - } - - // stores the number of bits required for packing delta blob in - // delta encoding - bitsDeltaMax = utils.findClosestNumBits(deltaMax); - - // monotonic condition - if (isIncreasing || isDecreasing) { - encoding = EncodingType.DELTA; - return; - } - } - - // PATCHED_BASE encoding check - - // percentile values are computed for the zigzag encoded values. 
if the - // number of bit requirement between 90th and 100th percentile varies - // beyond a threshold then we need to patch the values. if the variation - // is not significant then we can use direct encoding - - zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9); - int diffBitsLH = zzBits100p - zzBits90p; - - // if the difference between 90th percentile and 100th percentile fixed - // bits is > 1 then we need patch the values - if (diffBitsLH > 1) { - - // patching is done only on base reduced values. - // remove base from literals - for (int i = 0; i < numLiterals; i++) { - baseRedLiterals[i] = literals[i] - min; - } - - // 95th percentile width is used to determine max allowed value - // after which patching will be done - brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95); - - // 100th percentile is used to compute the max patch width - brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0); - - // after base reducing the values, if the difference in bits between - // 95th percentile and 100th percentile value is zero then there - // is no point in patching the values, in which case we will - // fallback to DIRECT encoding. - // The decision to use patched base was based on zigzag values, but the - // actual patching is done on base reduced literals. - if ((brBits100p - brBits95p) != 0) { - encoding = EncodingType.PATCHED_BASE; - preparePatchedBlob(); - return; - } else { - encoding = EncodingType.DIRECT; - return; - } - } else { - // if difference in bits between 95th percentile and 100th percentile is - // 0, then patch length will become 0. 
Hence we will fallback to direct - encoding = EncodingType.DIRECT; - return; - } - } - - private void computeZigZagLiterals() { - // populate zigzag encoded literals - long zzEncVal = 0; - for (int i = 0; i < numLiterals; i++) { - if (signed) { - zzEncVal = utils.zigzagEncode(literals[i]); - } else { - zzEncVal = literals[i]; - } - zigzagLiterals[i] = zzEncVal; - } - } - - private void preparePatchedBlob() { - // mask will be max value beyond which patch will be generated - long mask = (1L << brBits95p) - 1; - - // since we are considering only 95 percentile, the size of gap and - // patch array can contain only be 5% values - patchLength = (int) Math.ceil((numLiterals * 0.05)); - - int[] gapList = new int[patchLength]; - long[] patchList = new long[patchLength]; - - // #bit for patch - patchWidth = brBits100p - brBits95p; - patchWidth = utils.getClosestFixedBits(patchWidth); - - // if patch bit requirement is 64 then it will not possible to pack - // gap and patch together in a long. To make sure gap and patch can be - // packed together adjust the patch width - if (patchWidth == 64) { - patchWidth = 56; - brBits95p = 8; - mask = (1L << brBits95p) - 1; - } - - int gapIdx = 0; - int patchIdx = 0; - int prev = 0; - int gap = 0; - int maxGap = 0; - - for(int i = 0; i < numLiterals; i++) { - // if value is above mask then create the patch and record the gap - if (baseRedLiterals[i] > mask) { - gap = i - prev; - if (gap > maxGap) { - maxGap = gap; - } - - // gaps are relative, so store the previous patched value index - prev = i; - gapList[gapIdx++] = gap; - - // extract the most significant bits that are over mask bits - long patch = baseRedLiterals[i] >>> brBits95p; - patchList[patchIdx++] = patch; - - // strip off the MSB to enable safe bit packing - baseRedLiterals[i] &= mask; - } - } - - // adjust the patch length to number of entries in gap list - patchLength = gapIdx; - - // if the element to be patched is the first and only element then - // max gap will be 0, 
but to store the gap as 0 we need atleast 1 bit - if (maxGap == 0 && patchLength != 0) { - patchGapWidth = 1; - } else { - patchGapWidth = utils.findClosestNumBits(maxGap); - } - - // special case: if the patch gap width is greater than 256, then - // we need 9 bits to encode the gap width. But we only have 3 bits in - // header to record the gap width. To deal with this case, we will save - // two entries in patch list in the following way - // 256 gap width => 0 for patch value - // actual gap - 256 => actual patch value - // We will do the same for gap width = 511. If the element to be patched is - // the last element in the scope then gap width will be 511. In this case we - // will have 3 entries in the patch list in the following way - // 255 gap width => 0 for patch value - // 255 gap width => 0 for patch value - // 1 gap width => actual patch value - if (patchGapWidth > 8) { - patchGapWidth = 8; - // for gap = 511, we need two additional entries in patch list - if (maxGap == 511) { - patchLength += 2; - } else { - patchLength += 1; - } - } - - // create gap vs patch list - gapIdx = 0; - patchIdx = 0; - gapVsPatchList = new long[patchLength]; - for(int i = 0; i < patchLength; i++) { - long g = gapList[gapIdx++]; - long p = patchList[patchIdx++]; - while (g > 255) { - gapVsPatchList[i++] = (255L << patchWidth); - g -= 255; - } - - // store patch value in LSBs and gap in MSBs - gapVsPatchList[i] = (g << patchWidth) | p; - } - } - - /** - * clears all the variables - */ - private void clear() { - numLiterals = 0; - encoding = null; - prevDelta = 0; - fixedDelta = 0; - zzBits90p = 0; - zzBits100p = 0; - brBits95p = 0; - brBits100p = 0; - bitsDeltaMax = 0; - patchGapWidth = 0; - patchLength = 0; - patchWidth = 0; - gapVsPatchList = null; - min = 0; - isFixedDelta = true; - } - - @Override - public void flush() throws IOException { - if (numLiterals != 0) { - if (variableRunLength != 0) { - determineEncoding(); - writeValues(); - } else if (fixedRunLength != 0) { 
- if (fixedRunLength < MIN_REPEAT) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - determineEncoding(); - writeValues(); - } else if (fixedRunLength >= MIN_REPEAT - && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - encoding = EncodingType.SHORT_REPEAT; - writeValues(); - } else { - encoding = EncodingType.DELTA; - isFixedDelta = true; - writeValues(); - } - } - } - output.flush(); - } - - @Override - public void write(long val) throws IOException { - if (numLiterals == 0) { - initializeLiterals(val); - } else { - if (numLiterals == 1) { - prevDelta = val - literals[0]; - literals[numLiterals++] = val; - // if both values are same count as fixed run else variable run - if (val == literals[0]) { - fixedRunLength = 2; - variableRunLength = 0; - } else { - fixedRunLength = 0; - variableRunLength = 2; - } - } else { - long currentDelta = val - literals[numLiterals - 1]; - if (prevDelta == 0 && currentDelta == 0) { - // fixed delta run - - literals[numLiterals++] = val; - - // if variable run is non-zero then we are seeing repeating - // values at the end of variable run in which case keep - // updating variable and fixed runs - if (variableRunLength > 0) { - fixedRunLength = 2; - } - fixedRunLength += 1; - - // if fixed run met the minimum condition and if variable - // run is non-zero then flush the variable run and shift the - // tail fixed runs to start of the buffer - if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { - numLiterals -= MIN_REPEAT; - variableRunLength -= MIN_REPEAT - 1; - // copy the tail fixed runs - long[] tailVals = new long[MIN_REPEAT]; - System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT); - - // determine variable encoding and flush values - determineEncoding(); - writeValues(); - - // shift tail fixed runs to beginning of the buffer - for(long l : tailVals) { - literals[numLiterals++] = l; - } - } - - // if fixed runs reached max repeat length then write values - if (fixedRunLength == MAX_SCOPE) { - 
determineEncoding(); - writeValues(); - } - } else { - // variable delta run - - // if fixed run length is non-zero and if it satisfies the - // short repeat conditions then write the values as short repeats - // else use delta encoding - if (fixedRunLength >= MIN_REPEAT) { - if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - encoding = EncodingType.SHORT_REPEAT; - writeValues(); - } else { - encoding = EncodingType.DELTA; - isFixedDelta = true; - writeValues(); - } - } - - // if fixed run length is 0 && fixedRunLength < MIN_REPEAT) { - if (val != literals[numLiterals - 1]) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - } - } - - // after writing values re-initialize the variables - if (numLiterals == 0) { - initializeLiterals(val); - } else { - // keep updating variable run lengths - prevDelta = val - literals[numLiterals - 1]; - literals[numLiterals++] = val; - variableRunLength += 1; - - // if variable run length reach the max scope, write it - if (variableRunLength == MAX_SCOPE) { - determineEncoding(); - writeValues(); - } - } - } - } - } - } - - private void initializeLiterals(long val) { - literals[numLiterals++] = val; - fixedRunLength = 1; - variableRunLength = 1; - } - - @Override - public void getPosition(PositionRecorder recorder) throws IOException { - output.getPosition(recorder); - recorder.addPosition(numLiterals); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java deleted file mode 100644 index 53687b7fdb..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java +++ /dev/null @@ -1,844 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.math.BigInteger; - -final class SerializationUtils { - - private final static int BUFFER_SIZE = 64; - private final byte[] readBuffer; - private final byte[] writeBuffer; - - public SerializationUtils() { - this.readBuffer = new byte[BUFFER_SIZE]; - this.writeBuffer = new byte[BUFFER_SIZE]; - } - - void writeVulong(OutputStream output, long value) throws IOException { - while (true) { - if ((value & ~0x7f) == 0) { - output.write((byte) value); - return; - } else { - output.write((byte) (0x80 | (value & 0x7f))); - value >>>= 7; - } - } - } - - void writeVslong(OutputStream output, long value) throws IOException { - writeVulong(output, (value << 1) ^ (value >> 63)); - } - - - long readVulong(InputStream in) throws IOException { - long result = 0; - long b; - int offset = 0; - do { - b = in.read(); - if (b == -1) { - throw new EOFException("Reading Vulong past EOF"); - } - result |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return result; - } - - long readVslong(InputStream in) throws IOException { - long result = readVulong(in); - return (result >>> 1) ^ -(result & 1); - } - 
- float readFloat(InputStream in) throws IOException { - int ser = in.read() | (in.read() << 8) | (in.read() << 16) | - (in.read() << 24); - return Float.intBitsToFloat(ser); - } - - void writeFloat(OutputStream output, float value) throws IOException { - int ser = Float.floatToIntBits(value); - output.write(ser & 0xff); - output.write((ser >> 8) & 0xff); - output.write((ser >> 16) & 0xff); - output.write((ser >> 24) & 0xff); - } - - double readDouble(InputStream in) throws IOException { - return Double.longBitsToDouble(readLongLE(in)); - } - - long readLongLE(InputStream in) throws IOException { - in.read(readBuffer, 0, 8); - return (((readBuffer[0] & 0xff) << 0) - + ((readBuffer[1] & 0xff) << 8) - + ((readBuffer[2] & 0xff) << 16) - + ((long) (readBuffer[3] & 0xff) << 24) - + ((long) (readBuffer[4] & 0xff) << 32) - + ((long) (readBuffer[5] & 0xff) << 40) - + ((long) (readBuffer[6] & 0xff) << 48) - + ((long) (readBuffer[7] & 0xff) << 56)); - } - - void writeDouble(OutputStream output, double value) throws IOException { - writeLongLE(output, Double.doubleToLongBits(value)); - } - - private void writeLongLE(OutputStream output, long value) throws IOException { - writeBuffer[0] = (byte) ((value >> 0) & 0xff); - writeBuffer[1] = (byte) ((value >> 8) & 0xff); - writeBuffer[2] = (byte) ((value >> 16) & 0xff); - writeBuffer[3] = (byte) ((value >> 24) & 0xff); - writeBuffer[4] = (byte) ((value >> 32) & 0xff); - writeBuffer[5] = (byte) ((value >> 40) & 0xff); - writeBuffer[6] = (byte) ((value >> 48) & 0xff); - writeBuffer[7] = (byte) ((value >> 56) & 0xff); - output.write(writeBuffer, 0, 8); - } - - /** - * Write the arbitrarily sized signed BigInteger in vint format. - * - * Signed integers are encoded using the low bit as the sign bit using zigzag - * encoding. - * - * Each byte uses the low 7 bits for data and the high bit for stop/continue. - * - * Bytes are stored LSB first. 
- * @param output the stream to write to - * @param value the value to output - * @throws IOException - */ - static void writeBigInteger(OutputStream output, - BigInteger value) throws IOException { - // encode the signed number as a positive integer - value = value.shiftLeft(1); - int sign = value.signum(); - if (sign < 0) { - value = value.negate(); - value = value.subtract(BigInteger.ONE); - } - int length = value.bitLength(); - while (true) { - long lowBits = value.longValue() & 0x7fffffffffffffffL; - length -= 63; - // write out the next 63 bits worth of data - for(int i=0; i < 9; ++i) { - // if this is the last byte, leave the high bit off - if (length <= 0 && (lowBits & ~0x7f) == 0) { - output.write((byte) lowBits); - return; - } else { - output.write((byte) (0x80 | (lowBits & 0x7f))); - lowBits >>>= 7; - } - } - value = value.shiftRight(63); - } - } - - /** - * Read the signed arbitrary sized BigInteger BigInteger in vint format - * @param input the stream to read from - * @return the read BigInteger - * @throws IOException - */ - static BigInteger readBigInteger(InputStream input) throws IOException { - BigInteger result = BigInteger.ZERO; - long work = 0; - int offset = 0; - long b; - do { - b = input.read(); - if (b == -1) { - throw new EOFException("Reading BigInteger past EOF from " + input); - } - work |= (0x7f & b) << (offset % 63); - offset += 7; - // if we've read 63 bits, roll them into the result - if (offset == 63) { - result = BigInteger.valueOf(work); - work = 0; - } else if (offset % 63 == 0) { - result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63)); - work = 0; - } - } while (b >= 0x80); - if (work != 0) { - result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63)); - } - // convert back to a signed number - boolean isNegative = result.testBit(0); - if (isNegative) { - result = result.add(BigInteger.ONE); - result = result.negate(); - } - result = result.shiftRight(1); - return result; - } - - enum FixedBitSizes { - 
ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, - TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR; - } - - /** - * Count the number of bits required to encode the given value - * @param value - * @return bits required to store value - */ - int findClosestNumBits(long value) { - int count = 0; - while (value != 0) { - count++; - value = value >>> 1; - } - return getClosestFixedBits(count); - } - - /** - * zigzag encode the given value - * @param val - * @return zigzag encoded value - */ - long zigzagEncode(long val) { - return (val << 1) ^ (val >> 63); - } - - /** - * zigzag decode the given value - * @param val - * @return zizag decoded value - */ - long zigzagDecode(long val) { - return (val >>> 1) ^ -(val & 1); - } - - /** - * Compute the bits required to represent pth percentile value - * @param data - array - * @param p - percentile value (>=0.0 to <=1.0) - * @return pth percentile bits - */ - int percentileBits(long[] data, int offset, int length, double p) { - if ((p > 1.0) || (p <= 0.0)) { - return -1; - } - - // histogram that store the encoded bit requirement for each values. 
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - int[] hist = new int[32]; - - // compute the histogram - for(int i = offset; i < (offset + length); i++) { - int idx = encodeBitWidth(findClosestNumBits(data[i])); - hist[idx] += 1; - } - - int perLen = (int) (length * (1.0 - p)); - - // return the bits required by pth percentile length - for(int i = hist.length - 1; i >= 0; i--) { - perLen -= hist[i]; - if (perLen < 0) { - return decodeBitWidth(i); - } - } - - return 0; - } - - /** - * Calculate the number of bytes required - * @param n - number of values - * @param numBits - bit width - * @return number of bytes required - */ - int getTotalBytesRequired(int n, int numBits) { - return (n * numBits + 7) / 8; - } - - /** - * For a given fixed bit this function will return the closest available fixed - * bit - * @param n - * @return closest valid fixed bit - */ - int getClosestFixedBits(int n) { - if (n == 0) { - return 1; - } - - if (n >= 1 && n <= 24) { - return n; - } else if (n > 24 && n <= 26) { - return 26; - } else if (n > 26 && n <= 28) { - return 28; - } else if (n > 28 && n <= 30) { - return 30; - } else if (n > 30 && n <= 32) { - return 32; - } else if (n > 32 && n <= 40) { - return 40; - } else if (n > 40 && n <= 48) { - return 48; - } else if (n > 48 && n <= 56) { - return 56; - } else { - return 64; - } - } - - public int getClosestAlignedFixedBits(int n) { - if (n == 0 || n == 1) { - return 1; - } else if (n > 1 && n <= 2) { - return 2; - } else if (n > 2 && n <= 4) { - return 4; - } else if (n > 4 && n <= 8) { - return 8; - } else if (n > 8 && n <= 16) { - return 16; - } else if (n > 16 && n <= 24) { - return 24; - } else if (n > 24 && n <= 32) { - return 32; - } else if (n > 32 && n <= 40) { - return 40; - } else if (n > 40 && n <= 48) { - return 48; - } else if (n > 48 && n <= 56) { - return 56; - } else { - return 64; - } - } - - /** - * Finds the closest available fixed bit width match and returns its encoded - * value 
(ordinal) - * @param n - fixed bit width to encode - * @return encoded fixed bit width - */ - int encodeBitWidth(int n) { - n = getClosestFixedBits(n); - - if (n >= 1 && n <= 24) { - return n - 1; - } else if (n > 24 && n <= 26) { - return FixedBitSizes.TWENTYSIX.ordinal(); - } else if (n > 26 && n <= 28) { - return FixedBitSizes.TWENTYEIGHT.ordinal(); - } else if (n > 28 && n <= 30) { - return FixedBitSizes.THIRTY.ordinal(); - } else if (n > 30 && n <= 32) { - return FixedBitSizes.THIRTYTWO.ordinal(); - } else if (n > 32 && n <= 40) { - return FixedBitSizes.FORTY.ordinal(); - } else if (n > 40 && n <= 48) { - return FixedBitSizes.FORTYEIGHT.ordinal(); - } else if (n > 48 && n <= 56) { - return FixedBitSizes.FIFTYSIX.ordinal(); - } else { - return FixedBitSizes.SIXTYFOUR.ordinal(); - } - } - - /** - * Decodes the ordinal fixed bit value to actual fixed bit width value - * @param n - encoded fixed bit width - * @return decoded fixed bit width - */ - int decodeBitWidth(int n) { - if (n >= FixedBitSizes.ONE.ordinal() - && n <= FixedBitSizes.TWENTYFOUR.ordinal()) { - return n + 1; - } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) { - return 26; - } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) { - return 28; - } else if (n == FixedBitSizes.THIRTY.ordinal()) { - return 30; - } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) { - return 32; - } else if (n == FixedBitSizes.FORTY.ordinal()) { - return 40; - } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) { - return 48; - } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) { - return 56; - } else { - return 64; - } - } - - /** - * Bitpack and write the input values to underlying output stream - * @param input - values to write - * @param offset - offset - * @param len - length - * @param bitSize - bit width - * @param output - output stream - * @throws IOException - */ - void writeInts(long[] input, int offset, int len, int bitSize, - OutputStream output) throws IOException { - if (input == null || input.length < 
1 || offset < 0 || len < 1 - || bitSize < 1) { - return; - } - - switch (bitSize) { - case 1: - unrolledBitPack1(input, offset, len, output); - return; - case 2: - unrolledBitPack2(input, offset, len, output); - return; - case 4: - unrolledBitPack4(input, offset, len, output); - return; - case 8: - unrolledBitPack8(input, offset, len, output); - return; - case 16: - unrolledBitPack16(input, offset, len, output); - return; - case 24: - unrolledBitPack24(input, offset, len, output); - return; - case 32: - unrolledBitPack32(input, offset, len, output); - return; - case 40: - unrolledBitPack40(input, offset, len, output); - return; - case 48: - unrolledBitPack48(input, offset, len, output); - return; - case 56: - unrolledBitPack56(input, offset, len, output); - return; - case 64: - unrolledBitPack64(input, offset, len, output); - return; - default: - break; - } - - int bitsLeft = 8; - byte current = 0; - for(int i = offset; i < (offset + len); i++) { - long value = input[i]; - int bitsToWrite = bitSize; - while (bitsToWrite > bitsLeft) { - // add the bits to the bottom of the current word - current |= value >>> (bitsToWrite - bitsLeft); - // subtract out the bits we just added - bitsToWrite -= bitsLeft; - // zero out the bits above bitsToWrite - value &= (1L << bitsToWrite) - 1; - output.write(current); - current = 0; - bitsLeft = 8; - } - bitsLeft -= bitsToWrite; - current |= value << bitsLeft; - if (bitsLeft == 0) { - output.write(current); - current = 0; - bitsLeft = 8; - } - } - - // flush - if (bitsLeft != 8) { - output.write(current); - current = 0; - bitsLeft = 8; - } - } - - private void unrolledBitPack1(long[] input, int offset, int len, - OutputStream output) throws IOException { - final int numHops = 8; - final int remainder = len % numHops; - final int endOffset = offset + len; - final int endUnroll = endOffset - remainder; - int val = 0; - for (int i = offset; i < endUnroll; i = i + numHops) { - val = (int) (val | ((input[i] & 1) << 7) - | ((input[i + 1] & 
1) << 6) - | ((input[i + 2] & 1) << 5) - | ((input[i + 3] & 1) << 4) - | ((input[i + 4] & 1) << 3) - | ((input[i + 5] & 1) << 2) - | ((input[i + 6] & 1) << 1) - | (input[i + 7]) & 1); - output.write(val); - val = 0; - } - - if (remainder > 0) { - int startShift = 7; - for (int i = endUnroll; i < endOffset; i++) { - val = (int) (val | (input[i] & 1) << startShift); - startShift -= 1; - } - output.write(val); - } - } - - private void unrolledBitPack2(long[] input, int offset, int len, - OutputStream output) throws IOException { - final int numHops = 4; - final int remainder = len % numHops; - final int endOffset = offset + len; - final int endUnroll = endOffset - remainder; - int val = 0; - for (int i = offset; i < endUnroll; i = i + numHops) { - val = (int) (val | ((input[i] & 3) << 6) - | ((input[i + 1] & 3) << 4) - | ((input[i + 2] & 3) << 2) - | (input[i + 3]) & 3); - output.write(val); - val = 0; - } - - if (remainder > 0) { - int startShift = 6; - for (int i = endUnroll; i < endOffset; i++) { - val = (int) (val | (input[i] & 3) << startShift); - startShift -= 2; - } - output.write(val); - } - } - - private void unrolledBitPack4(long[] input, int offset, int len, - OutputStream output) throws IOException { - final int numHops = 2; - final int remainder = len % numHops; - final int endOffset = offset + len; - final int endUnroll = endOffset - remainder; - int val = 0; - for (int i = offset; i < endUnroll; i = i + numHops) { - val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15); - output.write(val); - val = 0; - } - - if (remainder > 0) { - int startShift = 4; - for (int i = endUnroll; i < endOffset; i++) { - val = (int) (val | (input[i] & 15) << startShift); - startShift -= 4; - } - output.write(val); - } - } - - private void unrolledBitPack8(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 1); - } - - private void unrolledBitPack16(long[] input, int offset, int len, - 
OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 2); - } - - private void unrolledBitPack24(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 3); - } - - private void unrolledBitPack32(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 4); - } - - private void unrolledBitPack40(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 5); - } - - private void unrolledBitPack48(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 6); - } - - private void unrolledBitPack56(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 7); - } - - private void unrolledBitPack64(long[] input, int offset, int len, - OutputStream output) throws IOException { - unrolledBitPackBytes(input, offset, len, output, 8); - } - - private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException { - final int numHops = 8; - final int remainder = len % numHops; - final int endOffset = offset + len; - final int endUnroll = endOffset - remainder; - int i = offset; - for (; i < endUnroll; i = i + numHops) { - writeLongBE(output, input, i, numHops, numBytes); - } - - if (remainder > 0) { - writeRemainingLongs(output, i, input, remainder, numBytes); - } - } - - private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder, - int numBytes) throws IOException { - final int numHops = remainder; - - int idx = 0; - switch (numBytes) { - case 1: - while (remainder > 0) { - writeBuffer[idx] = (byte) (input[offset + idx] & 255); - remainder--; - idx++; - } - break; - case 2: - while (remainder > 
0) { - writeLongBE2(output, input[offset + idx], idx * 2); - remainder--; - idx++; - } - break; - case 3: - while (remainder > 0) { - writeLongBE3(output, input[offset + idx], idx * 3); - remainder--; - idx++; - } - break; - case 4: - while (remainder > 0) { - writeLongBE4(output, input[offset + idx], idx * 4); - remainder--; - idx++; - } - break; - case 5: - while (remainder > 0) { - writeLongBE5(output, input[offset + idx], idx * 5); - remainder--; - idx++; - } - break; - case 6: - while (remainder > 0) { - writeLongBE6(output, input[offset + idx], idx * 6); - remainder--; - idx++; - } - break; - case 7: - while (remainder > 0) { - writeLongBE7(output, input[offset + idx], idx * 7); - remainder--; - idx++; - } - break; - case 8: - while (remainder > 0) { - writeLongBE8(output, input[offset + idx], idx * 8); - remainder--; - idx++; - } - break; - default: - break; - } - - final int toWrite = numHops * numBytes; - output.write(writeBuffer, 0, toWrite); - } - - private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException { - - switch (numBytes) { - case 1: - writeBuffer[0] = (byte) (input[offset + 0] & 255); - writeBuffer[1] = (byte) (input[offset + 1] & 255); - writeBuffer[2] = (byte) (input[offset + 2] & 255); - writeBuffer[3] = (byte) (input[offset + 3] & 255); - writeBuffer[4] = (byte) (input[offset + 4] & 255); - writeBuffer[5] = (byte) (input[offset + 5] & 255); - writeBuffer[6] = (byte) (input[offset + 6] & 255); - writeBuffer[7] = (byte) (input[offset + 7] & 255); - break; - case 2: - writeLongBE2(output, input[offset + 0], 0); - writeLongBE2(output, input[offset + 1], 2); - writeLongBE2(output, input[offset + 2], 4); - writeLongBE2(output, input[offset + 3], 6); - writeLongBE2(output, input[offset + 4], 8); - writeLongBE2(output, input[offset + 5], 10); - writeLongBE2(output, input[offset + 6], 12); - writeLongBE2(output, input[offset + 7], 14); - break; - case 3: - writeLongBE3(output, input[offset + 
0], 0); - writeLongBE3(output, input[offset + 1], 3); - writeLongBE3(output, input[offset + 2], 6); - writeLongBE3(output, input[offset + 3], 9); - writeLongBE3(output, input[offset + 4], 12); - writeLongBE3(output, input[offset + 5], 15); - writeLongBE3(output, input[offset + 6], 18); - writeLongBE3(output, input[offset + 7], 21); - break; - case 4: - writeLongBE4(output, input[offset + 0], 0); - writeLongBE4(output, input[offset + 1], 4); - writeLongBE4(output, input[offset + 2], 8); - writeLongBE4(output, input[offset + 3], 12); - writeLongBE4(output, input[offset + 4], 16); - writeLongBE4(output, input[offset + 5], 20); - writeLongBE4(output, input[offset + 6], 24); - writeLongBE4(output, input[offset + 7], 28); - break; - case 5: - writeLongBE5(output, input[offset + 0], 0); - writeLongBE5(output, input[offset + 1], 5); - writeLongBE5(output, input[offset + 2], 10); - writeLongBE5(output, input[offset + 3], 15); - writeLongBE5(output, input[offset + 4], 20); - writeLongBE5(output, input[offset + 5], 25); - writeLongBE5(output, input[offset + 6], 30); - writeLongBE5(output, input[offset + 7], 35); - break; - case 6: - writeLongBE6(output, input[offset + 0], 0); - writeLongBE6(output, input[offset + 1], 6); - writeLongBE6(output, input[offset + 2], 12); - writeLongBE6(output, input[offset + 3], 18); - writeLongBE6(output, input[offset + 4], 24); - writeLongBE6(output, input[offset + 5], 30); - writeLongBE6(output, input[offset + 6], 36); - writeLongBE6(output, input[offset + 7], 42); - break; - case 7: - writeLongBE7(output, input[offset + 0], 0); - writeLongBE7(output, input[offset + 1], 7); - writeLongBE7(output, input[offset + 2], 14); - writeLongBE7(output, input[offset + 3], 21); - writeLongBE7(output, input[offset + 4], 28); - writeLongBE7(output, input[offset + 5], 35); - writeLongBE7(output, input[offset + 6], 42); - writeLongBE7(output, input[offset + 7], 49); - break; - case 8: - writeLongBE8(output, input[offset + 0], 0); - writeLongBE8(output, 
input[offset + 1], 8); - writeLongBE8(output, input[offset + 2], 16); - writeLongBE8(output, input[offset + 3], 24); - writeLongBE8(output, input[offset + 4], 32); - writeLongBE8(output, input[offset + 5], 40); - writeLongBE8(output, input[offset + 6], 48); - writeLongBE8(output, input[offset + 7], 56); - break; - default: - break; - } - - final int toWrite = numHops * numBytes; - output.write(writeBuffer, 0, toWrite); - } - - private void writeLongBE2(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 8); - writeBuffer[wbOffset + 1] = (byte) (val >>> 0); - } - - private void writeLongBE3(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 16); - writeBuffer[wbOffset + 1] = (byte) (val >>> 8); - writeBuffer[wbOffset + 2] = (byte) (val >>> 0); - } - - private void writeLongBE4(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 24); - writeBuffer[wbOffset + 1] = (byte) (val >>> 16); - writeBuffer[wbOffset + 2] = (byte) (val >>> 8); - writeBuffer[wbOffset + 3] = (byte) (val >>> 0); - } - - private void writeLongBE5(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 32); - writeBuffer[wbOffset + 1] = (byte) (val >>> 24); - writeBuffer[wbOffset + 2] = (byte) (val >>> 16); - writeBuffer[wbOffset + 3] = (byte) (val >>> 8); - writeBuffer[wbOffset + 4] = (byte) (val >>> 0); - } - - private void writeLongBE6(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 40); - writeBuffer[wbOffset + 1] = (byte) (val >>> 32); - writeBuffer[wbOffset + 2] = (byte) (val >>> 24); - writeBuffer[wbOffset + 3] = (byte) (val >>> 16); - writeBuffer[wbOffset + 4] = (byte) (val >>> 8); - writeBuffer[wbOffset + 5] = (byte) (val >>> 0); - } - - private void writeLongBE7(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 48); - writeBuffer[wbOffset + 1] = 
(byte) (val >>> 40); - writeBuffer[wbOffset + 2] = (byte) (val >>> 32); - writeBuffer[wbOffset + 3] = (byte) (val >>> 24); - writeBuffer[wbOffset + 4] = (byte) (val >>> 16); - writeBuffer[wbOffset + 5] = (byte) (val >>> 8); - writeBuffer[wbOffset + 6] = (byte) (val >>> 0); - } - - private void writeLongBE8(OutputStream output, long val, int wbOffset) { - writeBuffer[wbOffset + 0] = (byte) (val >>> 56); - writeBuffer[wbOffset + 1] = (byte) (val >>> 48); - writeBuffer[wbOffset + 2] = (byte) (val >>> 40); - writeBuffer[wbOffset + 3] = (byte) (val >>> 32); - writeBuffer[wbOffset + 4] = (byte) (val >>> 24); - writeBuffer[wbOffset + 5] = (byte) (val >>> 16); - writeBuffer[wbOffset + 6] = (byte) (val >>> 8); - writeBuffer[wbOffset + 7] = (byte) (val >>> 0); - } - - // Do not want to use Guava LongMath.checkedSubtract() here as it will throw - // ArithmeticException in case of overflow - public boolean isSafeSubtract(long left, long right) { - return (left ^ right) >= 0 | (left ^ (left - right)) >= 0; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java deleted file mode 100644 index 285a32aeb8..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType; -import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.iq80.snappy.Snappy; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.EnumSet; - -class SnappyCodec implements CompressionCodec, DirectDecompressionCodec { - - Boolean direct = null; - - @Override - public boolean compress(ByteBuffer in, ByteBuffer out, - ByteBuffer overflow) throws IOException { - int inBytes = in.remaining(); - // I should work on a patch for Snappy to support an overflow buffer - // to prevent the extra buffer copy. 
- byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)]; - int outBytes = - Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes, - compressed, 0); - if (outBytes < inBytes) { - int remaining = out.remaining(); - if (remaining >= outBytes) { - System.arraycopy(compressed, 0, out.array(), out.arrayOffset() + - out.position(), outBytes); - out.position(out.position() + outBytes); - } else { - System.arraycopy(compressed, 0, out.array(), out.arrayOffset() + - out.position(), remaining); - out.position(out.limit()); - System.arraycopy(compressed, remaining, overflow.array(), - overflow.arrayOffset(), outBytes - remaining); - overflow.position(outBytes - remaining); - } - return true; - } else { - return false; - } - } - - @Override - public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { - if(in.isDirect() && out.isDirect()) { - directDecompress(in, out); - return; - } - int inOffset = in.position(); - int uncompressLen = - Snappy.uncompress(in.array(), in.arrayOffset() + inOffset, - in.limit() - inOffset, out.array(), out.arrayOffset() + out.position()); - out.position(uncompressLen + out.position()); - out.flip(); - } - - @Override - public boolean isAvailable() { - if (direct == null) { - try { - if (ShimLoader.getHadoopShims().getDirectDecompressor( - DirectCompressionType.SNAPPY) != null) { - direct = Boolean.valueOf(true); - } else { - direct = Boolean.valueOf(false); - } - } catch (UnsatisfiedLinkError ule) { - direct = Boolean.valueOf(false); - } - } - return direct.booleanValue(); - } - - @Override - public void directDecompress(ByteBuffer in, ByteBuffer out) - throws IOException { - DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims() - .getDirectDecompressor(DirectCompressionType.SNAPPY); - decompressShim.decompress(in, out); - out.flip(); // flip for read - } - - @Override - public CompressionCodec modify(EnumSet modifiers) { - // snappy allows no modifications - return this; - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java deleted file mode 100644 index 382164530c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -/** - * The name of a stream within a stripe. 
- */ -class StreamName implements Comparable { - private final int column; - private final OrcProto.Stream.Kind kind; - - public enum Area { - DATA, INDEX - } - - public StreamName(int column, OrcProto.Stream.Kind kind) { - this.column = column; - this.kind = kind; - } - - public boolean equals(Object obj) { - if (obj != null && obj instanceof StreamName) { - StreamName other = (StreamName) obj; - return other.column == column && other.kind == kind; - } else { - return false; - } - } - - @Override - public int compareTo(StreamName streamName) { - if (streamName == null) { - return -1; - } - Area area = getArea(kind); - Area otherArea = StreamName.getArea(streamName.kind); - if (area != otherArea) { - return -area.compareTo(otherArea); - } - if (column != streamName.column) { - return column < streamName.column ? -1 : 1; - } - return kind.compareTo(streamName.kind); - } - - public int getColumn() { - return column; - } - - public OrcProto.Stream.Kind getKind() { - return kind; - } - - public Area getArea() { - return getArea(kind); - } - - public static Area getArea(OrcProto.Stream.Kind kind) { - switch (kind) { - case ROW_INDEX: - case DICTIONARY_COUNT: - case BLOOM_FILTER: - return Area.INDEX; - default: - return Area.DATA; - } - } - - @Override - public String toString() { - return "Stream for column " + column + " kind " + kind; - } - - @Override - public int hashCode() { - return column * 101 + kind.getNumber(); - } -} - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java deleted file mode 100644 index 42486646bf..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Statistics for string columns. - */ -public interface StringColumnStatistics extends ColumnStatistics { - /** - * Get the minimum string. - * @return the minimum - */ - String getMinimum(); - - /** - * Get the maximum string. - * @return the maximum - */ - String getMaximum(); - - /** - * Get the total length of all strings - * @return the sum (total length) - */ - long getSum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java deleted file mode 100644 index 8835cefa5e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.io.Text; - -import java.io.IOException; -import java.io.OutputStream; - -/** - * A red-black tree that stores strings. The strings are stored as UTF-8 bytes - * and an offset for each entry. - */ -class StringRedBlackTree extends RedBlackTree { - private final DynamicByteArray byteArray = new DynamicByteArray(); - private final DynamicIntArray keyOffsets; - private String newKey; - - public StringRedBlackTree(int initialCapacity) { - super(initialCapacity); - keyOffsets = new DynamicIntArray(initialCapacity); - } - - public int add(String value) { - newKey = value; - return addNewKey(); - } - - private int addNewKey() { - // if the newKey is actually new, add it to our byteArray and store the offset & length - if (add()) { - int len = newKey.length(); - keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); - } - return lastAdd; - } - - public int add(Text value) { - newKey = value.toString(); - return addNewKey(); - } - - @Override - protected int compareValue(int position) { - int start = keyOffsets.get(position); - int end; - if (position + 1 == keyOffsets.size()) { - end = byteArray.size(); - } else { - end = keyOffsets.get(position+1); - } - return byteArray.compare(newKey.getBytes(), 0, newKey.length(), - start, end - start); - } - - /** - * The information about each node. - */ - public interface VisitorContext { - /** - * Get the position where the key was originally added. - * @return the number returned by add. 
- */ - int getOriginalPosition(); - - /** - * Write the bytes for the string to the given output stream. - * @param out the stream to write to. - * @throws IOException - */ - void writeBytes(OutputStream out) throws IOException; - - /** - * Get the original string. - * @return the string - */ - Text getText(); - - /** - * Get the number of bytes. - * @return the string's length in bytes - */ - int getLength(); - } - - /** - * The interface for visitors. - */ - public interface Visitor { - /** - * Called once for each node of the tree in sort order. - * @param context the information about each node - * @throws IOException - */ - void visit(VisitorContext context) throws IOException; - } - - private class VisitorContextImpl implements VisitorContext { - private int originalPosition; - private int start; - private int end; - private final Text text = new Text(); - - public int getOriginalPosition() { - return originalPosition; - } - - public Text getText() { - byteArray.setText(text, start, end - start); - return text; - } - - public void writeBytes(OutputStream out) throws IOException { - byteArray.write(out, start, end - start); - } - - public int getLength() { - return end - start; - } - - void setPosition(int position) { - originalPosition = position; - start = keyOffsets.get(originalPosition); - if (position + 1 == keyOffsets.size()) { - end = byteArray.size(); - } else { - end = keyOffsets.get(originalPosition + 1); - } - } - } - - private void recurse(int node, Visitor visitor, VisitorContextImpl context - ) throws IOException { - if (node != NULL) { - recurse(getLeft(node), visitor, context); - context.setPosition(node); - visitor.visit(context); - recurse(getRight(node), visitor, context); - } - } - - /** - * Visit all of the nodes in the tree in sorted order. 
- * @param visitor the action to be applied to each node - * @throws IOException - */ - public void visit(Visitor visitor) throws IOException { - recurse(root, visitor, new VisitorContextImpl()); - } - - /** - * Reset the table to empty. - */ - public void clear() { - super.clear(); - byteArray.clear(); - keyOffsets.clear(); - } - - public void getText(Text result, int originalPosition) { - int offset = keyOffsets.get(originalPosition); - int length; - if (originalPosition + 1 == keyOffsets.size()) { - length = byteArray.size() - offset; - } else { - length = keyOffsets.get(originalPosition + 1) - offset; - } - byteArray.setText(result, offset, length); - } - - /** - * Get the size of the character data in the table. - * @return the bytes used by the table - */ - public int getCharacterSize() { - return byteArray.size(); - } - - /** - * Calculate the approximate size in memory. - * @return the number of bytes used in storing the tree. - */ - public long getSizeInBytes() { - return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() + - super.getSizeInBytes(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java deleted file mode 100644 index 62819c1a22..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -/** - * Information about the stripes in an ORC file that is provided by the Reader. - */ -public interface StripeInformation { - /** - * Get the byte offset of the start of the stripe. - * @return the bytes from the start of the file - */ - long getOffset(); - - /** - * Get the total length of the stripe in bytes. - * @return the number of bytes in the stripe - */ - long getLength(); - - /** - * Get the length of the stripe's indexes. - * @return the number of bytes in the index - */ - long getIndexLength(); - - /** - * Get the length of the stripe's data. - * @return the number of bytes in the stripe - */ - long getDataLength(); - - /** - * Get the length of the stripe's tail section, which contains its index. - * @return the number of bytes in the tail - */ - long getFooterLength(); - - /** - * Get the number of rows in the stripe. - * @return a count of the number of rows - */ - long getNumberOfRows(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java deleted file mode 100644 index 013fc8ec80..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import java.util.List; - -public class StripeStatistics { - private final List cs; - - StripeStatistics(List list) { - this.cs = list; - } - - /** - * Return list of column statistics - * - * @return column stats - */ - public ColumnStatistics[] getColumnStatistics() { - ColumnStatistics[] result = new ColumnStatistics[cs.size()]; - for (int i = 0; i < result.length; ++i) { - result[i] = ColumnStatisticsImpl.deserialize(cs.get(i)); - } - return result; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java deleted file mode 100644 index 6fad0ac1fe..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import java.sql.Timestamp; - -/** - * Statistics for Timestamp columns. - */ -public interface TimestampColumnStatistics extends ColumnStatistics { - /** - * Get the minimum value for the column. - * @return minimum value - */ - Timestamp getMinimum(); - - /** - * Get the maximum value for the column. - * @return maximum value - */ - Timestamp getMaximum(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index c1781ef6a6..136e5a7b5d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -39,6 +39,7 @@ import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.unit.TimeUnit; +import org.apache.tajo.util.datetime.DateTimeConstants; import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.EOFException; @@ -889,7 +890,7 @@ private static int parseNanos(long serialized) { // borrowed from Facebook's TimestampStreamReader private static long decodeTimestamp(long seconds, long serializedNanos, long 
baseTimestampInSeconds) { - long millis = (seconds + baseTimestampInSeconds) * TimeUnit.MILLIS_PER_SECOND; + long millis = (seconds + baseTimestampInSeconds) * DateTimeConstants.MSECS_PER_SEC; long nanos = parseNanos(serializedNanos); // the rounding error exists because java always rounds up when dividing integers diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java index 669b44fbd3..2c85aa6653 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java @@ -18,6 +18,8 @@ package org.apache.tajo.storage.thirdparty.orc; +import org.apache.orc.OrcProto; +import org.apache.orc.StripeInformation; import org.apache.tajo.storage.Tuple; import java.io.IOException; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 4cf008a3a9..032885dece 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -19,7 +19,6 @@ package org.apache.tajo.storage.thirdparty.orc; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.primitives.Longs; import com.google.protobuf.ByteString; @@ -30,22 +29,20 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.IOConstants; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.apache.tajo.datum.*; -import 
org.apache.tajo.storage.Tuple; -import org.apache.tajo.storage.thirdparty.orc.CompressionCodec.Modifier; -import org.apache.tajo.storage.thirdparty.orc.OrcProto.RowIndexEntry; -import org.apache.tajo.storage.thirdparty.orc.OrcProto.StripeStatistics; -import org.apache.tajo.storage.thirdparty.orc.OrcProto.Type; -import org.apache.tajo.storage.thirdparty.orc.OrcProto.UserMetadataItem; import org.apache.hadoop.hive.ql.util.JavaDataModel; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.Text; -import org.apache.tajo.unit.TimeUnit; +import org.apache.orc.*; +import org.apache.orc.CompressionCodec.Modifier; +import org.apache.orc.OrcProto.RowIndexEntry; +import org.apache.orc.OrcUtils; +import org.apache.orc.impl.*; +import org.apache.tajo.datum.Datum; +import org.apache.tajo.datum.Inet4Datum; +import org.apache.tajo.datum.Int4Datum; +import org.apache.tajo.datum.Int8Datum; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.thirdparty.orc.OrcFile.*; +import org.apache.tajo.util.datetime.DateTimeConstants; import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; @@ -95,10 +92,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final boolean addBlockPadding; private final int bufferSize; private final long blockSize; - private final float paddingTolerance; + private final double paddingTolerance; + private final TypeDescription schema; + // the streams that make up the current stripe - private final Map streams = - new TreeMap<>(); + private final Map streams = new TreeMap<>(); private FSDataOutputStream rawWriter = null; // the compressed metadata information outStream @@ -112,47 +110,32 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private long 
rawDataSize = 0; private int rowsInIndex = 0; private int stripesAtLastFlush = -1; - private final List stripes = - new ArrayList<>(); - private final Map userMetadata = - new TreeMap<>(); + private final List stripes = new ArrayList<>(); + private final Map userMetadata = new TreeMap<>(); + private final StreamFactory streamFactory = new StreamFactory(); private final TreeWriter treeWriter; private final boolean buildIndex; private final MemoryManager memoryManager; - private final OrcFile.Version version; + private final Version version; private final Configuration conf; - private final OrcFile.WriterCallback callback; - private final OrcFile.WriterContext callbackContext; - private final OrcFile.EncodingStrategy encodingStrategy; - private final OrcFile.CompressionStrategy compressionStrategy; + private final WriterCallback callback; + private final WriterContext callbackContext; + private final EncodingStrategy encodingStrategy; + private final CompressionStrategy compressionStrategy; private final boolean[] bloomFilterColumns; private final double bloomFilterFpp; private boolean writeTimeZone; private TimeZone timeZone; - WriterImpl(FileSystem fs, - Path path, - Configuration conf, - ObjectInspector inspector, - long stripeSize, - CompressionKind compress, - int bufferSize, - int rowIndexStride, - MemoryManager memoryManager, - boolean addBlockPadding, - OrcFile.Version version, - OrcFile.WriterCallback callback, - OrcFile.EncodingStrategy encodingStrategy, - OrcFile.CompressionStrategy compressionStrategy, - float paddingTolerance, - long blockSizeValue, - String bloomFilterColumnNames, - double bloomFilterFpp, - TimeZone timeZone) throws IOException { + public WriterImpl(FileSystem fs, + Path path, + OrcFile.WriterOptions opts, + TimeZone timeZone) throws IOException { this.fs = fs; this.path = path; - this.conf = conf; - this.callback = callback; + this.conf = opts.getConfiguration(); + this.callback = opts.getCallback(); + this.schema = opts.getSchema(); 
if (callback != null) { callbackContext = new OrcFile.WriterContext(){ @@ -164,100 +147,60 @@ public Writer getWriter() { } else { callbackContext = null; } - this.adjustedStripeSize = stripeSize; - this.defaultStripeSize = stripeSize; - this.version = version; - this.encodingStrategy = encodingStrategy; - this.compressionStrategy = compressionStrategy; - this.addBlockPadding = addBlockPadding; - this.blockSize = blockSizeValue; - this.paddingTolerance = paddingTolerance; - this.compress = compress; - this.rowIndexStride = rowIndexStride; - this.memoryManager = memoryManager; - this.timeZone = timeZone; + this.adjustedStripeSize = opts.getStripeSize(); + this.defaultStripeSize = opts.getStripeSize(); + this.version = opts.getVersion(); + this.encodingStrategy = opts.getEncodingStrategy(); + this.compressionStrategy = opts.getCompressionStrategy(); + this.addBlockPadding = opts.getBlockPadding(); + this.blockSize = opts.getBlockSize(); + this.paddingTolerance = opts.getPaddingTolerance(); + this.compress = opts.getCompress(); + this.rowIndexStride = opts.getRowIndexStride(); + this.memoryManager = opts.getMemoryManager(); buildIndex = rowIndexStride > 0; codec = createCodec(compress); - String allColumns = conf.get(IOConstants.COLUMNS); - if (allColumns == null) { - allColumns = getColumnNamesFromInspector(inspector); - } - this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize); + int numColumns = schema.getMaximumId() + 1; + this.bufferSize = getEstimatedBufferSize(defaultStripeSize, + numColumns, opts.getBufferSize()); if (version == OrcFile.Version.V_0_11) { /* do not write bloom filters for ORC v11 */ - this.bloomFilterColumns = - OrcUtils.includeColumns(null, allColumns, inspector); + this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1]; } else { this.bloomFilterColumns = - OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector); + OrcUtils.includeColumns(opts.getBloomFilterColumns(), schema); } - this.bloomFilterFpp = 
bloomFilterFpp; - treeWriter = createTreeWriter(inspector, new StreamFactory(), false); + this.bloomFilterFpp = opts.getBloomFilterFpp(); + this.timeZone = timeZone; + treeWriter = createTreeWriter(schema, streamFactory, false); if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) { throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE); } // ensure that we are able to handle callbacks before we register ourselves - memoryManager.addWriter(path, stripeSize, this); - } - - private String getColumnNamesFromInspector(ObjectInspector inspector) { - List fieldNames = Lists.newArrayList(); - Joiner joiner = Joiner.on(","); - if (inspector instanceof StructObjectInspector) { - StructObjectInspector soi = (StructObjectInspector) inspector; - List fields = soi.getAllStructFieldRefs(); - for(StructField sf : fields) { - fieldNames.add(sf.getFieldName()); - } - } - return joiner.join(fieldNames); + memoryManager.addWriter(path, opts.getStripeSize(), this); } @VisibleForTesting - int getEstimatedBufferSize(int bs) { - return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs); - } - - int getEstimatedBufferSize(String colNames, int bs) { - long availableMem = getMemoryAvailableForORC(); - if (colNames != null) { - final int numCols = colNames.split(",").length; - if (numCols > COLUMN_COUNT_THRESHOLD) { - // In BufferedStream, there are 3 outstream buffers (compressed, - // uncompressed and overflow) and list of previously compressed buffers. - // Since overflow buffer is rarely used, lets consider only 2 allocation. - // Also, initially, the list of compression buffers will be empty. - final int outStreamBuffers = codec == null ? 1 : 2; - - // max possible streams per column is 5. For string columns, there is - // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams. 
- final int maxStreams = 5; - - // Lets assume 10% memory for holding dictionary in memory and other - // object allocations - final long miscAllocation = (long) (0.1f * availableMem); - - // compute the available memory - final long remainingMem = availableMem - miscAllocation; - - int estBufferSize = (int) (remainingMem / - (maxStreams * outStreamBuffers * numCols)); - estBufferSize = getClosestBufferSize(estBufferSize, bs); - if (estBufferSize > bs) { - estBufferSize = bs; - } - - LOG.info("WIDE TABLE - Number of columns: " + numCols + - " Chosen compression buffer size: " + estBufferSize); - return estBufferSize; - } + public static int getEstimatedBufferSize(long stripeSize, int numColumns, + int bs) { + // The worst case is that there are 2 big streams per a column and + // we want to guarantee that each stream gets ~10 buffers. + // This keeps buffers small enough that we don't get really small stripe + // sizes. + int estBufferSize = (int) (stripeSize / (20 * numColumns)); + estBufferSize = getClosestBufferSize(estBufferSize); + if (estBufferSize > bs) { + estBufferSize = bs; + } else { + LOG.info("WIDE TABLE - Number of columns: " + numColumns + + " Chosen compression buffer size: " + estBufferSize); } - return bs; + return estBufferSize; } - private int getClosestBufferSize(int estBufferSize, int bs) { + private static int getClosestBufferSize(int estBufferSize) { final int kb4 = 4 * 1024; final int kb8 = 8 * 1024; final int kb16 = 16 * 1024; @@ -617,8 +560,7 @@ public TimeZone getTimeZone() { */ private abstract static class TreeWriter { protected final int id; - protected final ObjectInspector inspector; - private final BitFieldWriter isPresent; + protected final BitFieldWriter isPresent; private final boolean isCompressed; protected final ColumnStatisticsImpl indexStatistics; protected final ColumnStatisticsImpl stripeColStatistics; @@ -635,24 +577,24 @@ private abstract static class TreeWriter { private final OrcProto.BloomFilter.Builder 
bloomFilterEntry; private boolean foundNulls; private OutStream isPresentOutStream; - private final List stripeStatsBuilders; + private final List stripeStatsBuilders; private final StreamFactory streamFactory; /** * Create a tree writer. * @param columnId the column id of the column to write - * @param inspector the object inspector to use + * @param schema the row schema * @param streamFactory limited access to the Writer's data. * @param nullable can the value be null? * @throws IOException */ - TreeWriter(int columnId, ObjectInspector inspector, + TreeWriter(int columnId, + TypeDescription schema, StreamFactory streamFactory, boolean nullable) throws IOException { this.streamFactory = streamFactory; this.isCompressed = streamFactory.isCompressed(); this.id = columnId; - this.inspector = inspector; if (nullable) { isPresentOutStream = streamFactory.createStream(id, OrcProto.Stream.Kind.PRESENT); @@ -662,9 +604,9 @@ private abstract static class TreeWriter { } this.foundNulls = false; createBloomFilter = streamFactory.getBloomFilterColumns()[columnId]; - indexStatistics = ColumnStatisticsImpl.create(inspector); - stripeColStatistics = ColumnStatisticsImpl.create(inspector); - fileStatistics = ColumnStatisticsImpl.create(inspector); + indexStatistics = ColumnStatisticsImpl.create(schema); + stripeColStatistics = ColumnStatisticsImpl.create(schema); + fileStatistics = ColumnStatisticsImpl.create(schema); childrenWriters = new TreeWriter[0]; rowIndex = OrcProto.RowIndex.newBuilder(); rowIndexEntry = OrcProto.RowIndexEntry.newBuilder(); @@ -913,10 +855,10 @@ private static class BooleanTreeWriter extends TreeWriter { private final BitFieldWriter writer; BooleanTreeWriter(int columnId, - ObjectInspector inspector, + TypeDescription schema, StreamFactory writer, boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + super(columnId, schema, writer, nullable); PositionedOutputStream out = writer.createStream(id, 
OrcProto.Stream.Kind.DATA); this.writer = new BitFieldWriter(out, 1); @@ -928,7 +870,8 @@ void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { boolean val = datum.asBool(); - indexStatistics.updateBoolean(val); + // TODO: validate the below line + indexStatistics.updateBoolean(val, 1); writer.write(val ? 1 : 0); } } @@ -952,10 +895,10 @@ private static class ByteTreeWriter extends TreeWriter { private final RunLengthByteWriter writer; ByteTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); this.writer = new RunLengthByteWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA)); recordPosition(rowIndexPosition); @@ -966,7 +909,7 @@ void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { byte val = datum.asByte(); - indexStatistics.updateInteger(val); + indexStatistics.updateInteger(val, 1); if (createBloomFilter) { bloomFilter.addLong(val); } @@ -994,10 +937,10 @@ private static class IntegerTreeWriter extends TreeWriter { private boolean isDirectV2 = true; IntegerTreeWriter(int columnId, - ObjectInspector inspector, + TypeDescription schema, StreamFactory writer, boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + super(columnId, schema, writer, nullable); OutStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); @@ -1027,7 +970,7 @@ void write(Datum datum) throws IOException { } else { val = datum.asInt2(); } - indexStatistics.updateInteger(val); + indexStatistics.updateInteger(val, 1); if (createBloomFilter) { // integers are converted to longs in column statistics and during SARG evaluation bloomFilter.addLong(val); @@ 
-1056,10 +999,10 @@ private static class FloatTreeWriter extends TreeWriter { private final SerializationUtils utils; FloatTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.utils = new SerializationUtils(); @@ -1100,10 +1043,10 @@ private static class DoubleTreeWriter extends TreeWriter { private final SerializationUtils utils; DoubleTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.utils = new SerializationUtils(); @@ -1138,33 +1081,33 @@ void recordPosition(PositionRecorder recorder) throws IOException { } } - private static class StringTreeWriter extends TreeWriter { + private static abstract class StringBaseTreeWriter extends TreeWriter { private static final int INITIAL_DICTIONARY_SIZE = 4096; private final OutStream stringOutput; private final IntegerWriter lengthOutput; private final IntegerWriter rowOutput; - private final StringRedBlackTree dictionary = + protected final StringRedBlackTree dictionary = new StringRedBlackTree(INITIAL_DICTIONARY_SIZE); - private final DynamicIntArray rows = new DynamicIntArray(); - private final PositionedOutputStream directStreamOutput; - private final IntegerWriter directLengthOutput; - private final List savedRowIndex = - new ArrayList<>(); + protected final DynamicIntArray rows = new DynamicIntArray(); + protected final PositionedOutputStream directStreamOutput; + protected final 
IntegerWriter directLengthOutput; + private final List savedRowIndex = + new ArrayList(); private final boolean buildIndex; - private final List rowIndexValueCount = new ArrayList<>(); + private final List rowIndexValueCount = new ArrayList(); // If the number of keys in a dictionary is greater than this fraction of //the total number of non-null rows, turn off dictionary encoding - private final float dictionaryKeySizeThreshold; - private boolean useDictionaryEncoding = true; + private final double dictionaryKeySizeThreshold; + protected boolean useDictionaryEncoding = true; private boolean isDirectV2 = true; private boolean doneDictionaryCheck; - private final boolean strideDictionaryCheck; + protected final boolean strideDictionaryCheck; - StringTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + StringBaseTreeWriter(int columnId, + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); stringOutput = writer.createStream(id, OrcProto.Stream.Kind.DICTIONARY_DATA); @@ -1178,33 +1121,14 @@ private static class StringTreeWriter extends TreeWriter { directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA); directLengthOutput = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); - dictionaryKeySizeThreshold = writer.getConfiguration().getFloat( - OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, - OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal); - strideDictionaryCheck = writer.getConfiguration().getBoolean( - OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, - OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.defaultBoolVal); + Configuration conf = writer.getConfiguration(); + 
dictionaryKeySizeThreshold = + org.apache.orc.OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf); + strideDictionaryCheck = + org.apache.orc.OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf); doneDictionaryCheck = false; } - @Override - void write(Datum datum) throws IOException { - super.write(datum); - if (datum != null && datum.isNotNull()) { - if (useDictionaryEncoding || !strideDictionaryCheck) { - rows.add(dictionary.add(datum.toString())); - } else { - // write data and length - directStreamOutput.write(datum.asByteArray(), 0, datum.size()); - directLengthOutput.write(datum.size()); - } - indexStatistics.updateString(datum.toString()); - if (createBloomFilter) { - bloomFilter.addBytes(datum.asByteArray(), datum.size()); - } - } - } - private boolean checkDictionaryEncoding() { if (!doneDictionaryCheck) { // Set the flag indicating whether or not to use dictionary encoding @@ -1270,7 +1194,7 @@ private void flushDictionary() throws IOException { private int currentId = 0; @Override public void visit(StringRedBlackTree.VisitorContext context - ) throws IOException { + ) throws IOException { context.writeBytes(stringOutput); lengthOutput.write(context.getLength()); dumpOrder[context.getOriginalPosition()] = currentId++; @@ -1384,29 +1308,76 @@ long estimateMemory() { } } + private static class StringTreeWriter extends StringBaseTreeWriter { + StringTreeWriter(int columnId, + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); + } + + @Override + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + if (useDictionaryEncoding || !strideDictionaryCheck) { + rows.add(dictionary.add(datum.toString())); + } else { + // write data and length + directStreamOutput.write(datum.asByteArray(), 0, datum.size()); + directLengthOutput.write(datum.size()); + } + byte[] buf = datum.asByteArray(); + 
indexStatistics.updateString(buf, 0, buf.length, 1); + if (createBloomFilter) { + bloomFilter.addBytes(buf, 0, buf.length); + } + } + } + } + /** * Under the covers, char is written to ORC the same way as string. */ private static class CharTreeWriter extends StringTreeWriter { + private final int itemLength; + private final byte[] padding; CharTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); + itemLength = schema.getMaxLength(); + padding = new byte[itemLength]; } - } - /** - * Under the covers, varchar is written to ORC the same way as string. - */ - private static class VarcharTreeWriter extends StringTreeWriter { + @Override + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + byte[] ptr; + byte[] buf = datum.asByteArray(); + if (buf.length >= itemLength) { + ptr = buf; + } else { + ptr = padding; + System.arraycopy(buf, 0, ptr, 0, buf.length); + Arrays.fill(ptr, buf.length, itemLength, (byte) ' '); + } + if (useDictionaryEncoding || !strideDictionaryCheck) { + rows.add(dictionary.add(ptr, 0, itemLength)); + } else { + // write data and length + directStreamOutput.write(ptr, 0, itemLength); + directLengthOutput.write(itemLength); + } - VarcharTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + indexStatistics.updateString(ptr, 0, ptr.length, 1); + if (createBloomFilter) { + bloomFilter.addBytes(ptr, 0, ptr.length); + } + } } } @@ -1416,10 +1387,10 @@ private static class BinaryTreeWriter extends TreeWriter { private boolean isDirectV2 = true; BinaryTreeWriter(int columnId, - ObjectInspector inspector, + TypeDescription schema, 
StreamFactory writer, boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + super(columnId, schema, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); @@ -1442,11 +1413,12 @@ OrcProto.ColumnEncoding getEncoding() { void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { - stream.write(datum.asByteArray(), 0, datum.size()); + byte[] buf = datum.asByteArray(); + stream.write(buf, 0, buf.length); length.write(datum.size()); - indexStatistics.updateBinary(datum); + indexStatistics.updateBinary(buf, 0, buf.length, 1); if (createBloomFilter) { - bloomFilter.addBytes(datum.asByteArray(), datum.size()); + bloomFilter.addBytes(buf, 0, buf.length); } } } @@ -1478,10 +1450,10 @@ private static class TimestampTreeWriter extends TreeWriter { private TimeZone timeZone; TimestampTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + TypeDescription schema, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, schema, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); this.seconds = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA), true, isDirectV2, writer); @@ -1489,7 +1461,7 @@ private static class TimestampTreeWriter extends TreeWriter { OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer); recordPosition(rowIndexPosition); // for unit tests to set different time zones - this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / TimeUnit.MILLIS_PER_SECOND; + this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC; writer.useWriterTimeZone(true); timeZone = writer.getTimeZone(); } @@ -1515,7 +1487,7 @@ void write(Datum datum) throws IOException { Timestamp val = new 
Timestamp(javaTimestamp); indexStatistics.updateTimestamp(val); - seconds.write((val.getTime() / TimeUnit.MILLIS_PER_SECOND) - base_timestamp); + seconds.write((val.getTime() / DateTimeConstants.MSECS_PER_SEC) - base_timestamp); nanos.write(formatNanos(val.getNanos())); if (createBloomFilter) { bloomFilter.addLong(val.getTime()); @@ -1561,12 +1533,12 @@ private static class DateTreeWriter extends TreeWriter { private final boolean isDirectV2; DateTreeWriter(int columnId, - ObjectInspector inspector, + TypeDescription schema, StreamFactory writer, boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); + super(columnId, schema, writer, nullable); OutStream out = writer.createStream(id, - OrcProto.Stream.Kind.DATA); + OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); this.writer = createIntegerWriter(out, true, isDirectV2, writer); recordPosition(rowIndexPosition); @@ -1612,19 +1584,17 @@ OrcProto.ColumnEncoding getEncoding() { } private static class StructTreeWriter extends TreeWriter { - private final List fields; StructTreeWriter(int columnId, - ObjectInspector inspector, + TypeDescription schema, StreamFactory writer, boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); - StructObjectInspector structObjectInspector = - (StructObjectInspector) inspector; - fields = structObjectInspector.getAllStructFieldRefs(); - childrenWriters = new TreeWriter[fields.size()]; + super(columnId, schema, writer, nullable); + List children = schema.getChildren(); + childrenWriters = new TreeWriter[children.size()]; for(int i=0; i < childrenWriters.length; ++i) { childrenWriters[i] = createTreeWriter( - fields.get(i).getFieldObjectInspector(), writer, true); + children.get(i), writer, + true); } recordPosition(rowIndexPosition); } @@ -1636,9 +1606,8 @@ void write(Datum datum) throws IOException { void writeTuple(Tuple tuple) throws IOException { super.write(tuple); if (tuple != null) { - for(int i 
= 0; i < fields.size(); ++i) { - TreeWriter writer = childrenWriters[i]; - writer.write(tuple.asDatum(i)); + for(int i = 0; i < childrenWriters.length; ++i) { + childrenWriters[i].write(tuple.asDatum(i)); } } } @@ -1654,159 +1623,136 @@ void writeStripe(OrcProto.StripeFooter.Builder builder, } } - private static TreeWriter createTreeWriter(ObjectInspector inspector, + private static TreeWriter createTreeWriter(TypeDescription schema, StreamFactory streamFactory, boolean nullable) throws IOException { - switch (inspector.getCategory()) { - case PRIMITIVE: - switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { - case BOOLEAN: - case VOID: - return new BooleanTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case BYTE: - return new ByteTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case SHORT: - case INT: - case LONG: - return new IntegerTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case FLOAT: - return new FloatTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case DOUBLE: - return new DoubleTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case STRING: - return new StringTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case CHAR: - return new CharTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case VARCHAR: - return new VarcharTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case BINARY: - return new BinaryTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case TIMESTAMP: - return new TimestampTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case DATE: - return new DateTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - default: - throw new 
IllegalArgumentException("Bad primitive category " + - ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); - } + switch (schema.getCategory()) { + case BOOLEAN: + return new BooleanTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case BYTE: + return new ByteTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case SHORT: + case INT: + case LONG: + return new IntegerTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case FLOAT: + return new FloatTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case DOUBLE: + return new DoubleTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case STRING: + return new StringTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case CHAR: + return new CharTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case BINARY: + return new BinaryTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case TIMESTAMP: + return new TimestampTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); + case DATE: + return new DateTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); case STRUCT: - return new StructTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); + return new StructTreeWriter(streamFactory.getNextColumnId(), + schema, streamFactory, nullable); default: throw new IllegalArgumentException("Bad category: " + - inspector.getCategory()); + schema.getCategory()); } } private static void writeTypes(OrcProto.Footer.Builder builder, - TreeWriter treeWriter) { + TypeDescription schema) { OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); - switch (treeWriter.inspector.getCategory()) { - case PRIMITIVE: - switch (((PrimitiveObjectInspector) treeWriter.inspector). 
- getPrimitiveCategory()) { - case VOID: - case BOOLEAN: - type.setKind(OrcProto.Type.Kind.BOOLEAN); - break; - case BYTE: - type.setKind(OrcProto.Type.Kind.BYTE); - break; - case SHORT: - type.setKind(OrcProto.Type.Kind.SHORT); - break; - case INT: - type.setKind(OrcProto.Type.Kind.INT); - break; - case LONG: - type.setKind(OrcProto.Type.Kind.LONG); - break; - case FLOAT: - type.setKind(OrcProto.Type.Kind.FLOAT); - break; - case DOUBLE: - type.setKind(OrcProto.Type.Kind.DOUBLE); - break; - case STRING: - type.setKind(OrcProto.Type.Kind.STRING); - break; - case CHAR: - // The char length needs to be written to file and should be available - // from the object inspector - CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo(); - type.setKind(Type.Kind.CHAR); - type.setMaximumLength(charTypeInfo.getLength()); - break; - case VARCHAR: - // The varchar length needs to be written to file and should be available - // from the object inspector - VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo(); - type.setKind(Type.Kind.VARCHAR); - type.setMaximumLength(typeInfo.getLength()); - break; - case BINARY: - type.setKind(OrcProto.Type.Kind.BINARY); - break; - case TIMESTAMP: - type.setKind(OrcProto.Type.Kind.TIMESTAMP); - break; - case DATE: - type.setKind(OrcProto.Type.Kind.DATE); - break; - case DECIMAL: - DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)((PrimitiveObjectInspector)treeWriter.inspector).getTypeInfo(); - type.setKind(OrcProto.Type.Kind.DECIMAL); - type.setPrecision(decTypeInfo.precision()); - type.setScale(decTypeInfo.scale()); - break; - default: - throw new IllegalArgumentException("Unknown primitive category: " + - ((PrimitiveObjectInspector) treeWriter.inspector). 
- getPrimitiveCategory()); - } + List children = schema.getChildren(); + switch (schema.getCategory()) { + case BOOLEAN: + type.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + type.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + type.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + type.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + type.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + type.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + type.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + type.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + type.setKind(OrcProto.Type.Kind.CHAR); + type.setMaximumLength(schema.getMaxLength()); + break; + case VARCHAR: + type.setKind(OrcProto.Type.Kind.VARCHAR); + type.setMaximumLength(schema.getMaxLength()); + break; + case BINARY: + type.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + type.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + type.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + type.setKind(OrcProto.Type.Kind.DECIMAL); + type.setPrecision(schema.getPrecision()); + type.setScale(schema.getScale()); break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); - type.addSubtypes(treeWriter.childrenWriters[0].id); + type.addSubtypes(children.get(0).getId()); break; case MAP: type.setKind(OrcProto.Type.Kind.MAP); - type.addSubtypes(treeWriter.childrenWriters[0].id); - type.addSubtypes(treeWriter.childrenWriters[1].id); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); + } break; case STRUCT: type.setKind(OrcProto.Type.Kind.STRUCT); - for(TreeWriter child: treeWriter.childrenWriters) { - type.addSubtypes(child.id); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); } - for(StructField field: ((StructTreeWriter) treeWriter).fields) { - type.addFieldNames(field.getFieldName()); + for(String field: schema.getFieldNames()) { + type.addFieldNames(field); } break; 
case UNION: type.setKind(OrcProto.Type.Kind.UNION); - for(TreeWriter child: treeWriter.childrenWriters) { - type.addSubtypes(child.id); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); } break; default: throw new IllegalArgumentException("Unknown category: " + - treeWriter.inspector.getCategory()); + schema.getCategory()); } builder.addTypes(type); - for(TreeWriter child: treeWriter.childrenWriters) { - writeTypes(builder, child); + if (children != null) { + for(TypeDescription child: children) { + writeTypes(builder, child); + } } } @@ -1853,9 +1799,9 @@ private void flushStripe() throws IOException { StreamName name = pair.getKey(); long streamSize = pair.getValue().getOutputSize(); builder.addStreams(OrcProto.Stream.newBuilder() - .setColumn(name.getColumn()) - .setKind(name.getKind()) - .setLength(streamSize)); + .setColumn(name.getColumn()) + .setKind(name.getKind()) + .setLength(streamSize)); if (StreamName.Area.INDEX == name.getArea()) { indexSize += streamSize; } else { @@ -1880,8 +1826,8 @@ private void flushStripe() throws IOException { // and user specified padding tolerance. Since stripe size can overflow // the default stripe size we should apply this correction to avoid // writing portion of last stripe to next hdfs block. - float correction = overflow > 0 ? (float) overflow - / (float) adjustedStripeSize : 0.0f; + double correction = overflow > 0 ? 
(double) overflow + / (double) adjustedStripeSize : 0.0; // correction should not be greater than user specified padding // tolerance @@ -1939,75 +1885,60 @@ private void flushStripe() throws IOException { } private long computeRawDataSize() { - long result = 0; - for (TreeWriter child : treeWriter.getChildrenWriters()) { - result += getRawDataSizeFromInspectors(child, child.inspector); - } - return result; + return getRawDataSize(treeWriter, schema); } - private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) { + private long getRawDataSize(TreeWriter child, + TypeDescription schema) { long total = 0; - switch (oi.getCategory()) { - case PRIMITIVE: - total += getRawDataSizeFromPrimitives(child, oi); - break; - case LIST: - case MAP: - case UNION: - case STRUCT: - for (TreeWriter tw : child.childrenWriters) { - total += getRawDataSizeFromInspectors(tw, tw.inspector); + long numVals = child.fileStatistics.getNumberOfValues(); + switch (schema.getCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case FLOAT: + return numVals * JavaDataModel.get().primitive1(); + case LONG: + case DOUBLE: + return numVals * JavaDataModel.get().primitive2(); + case STRING: + case VARCHAR: + case CHAR: + // ORC strings are converted to java Strings. so use JavaDataModel to + // compute the overall size of strings + StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics; + numVals = numVals == 0 ? 
1 : numVals; + int avgStringLen = (int) (scs.getSum() / numVals); + return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen); + case DECIMAL: + return numVals * JavaDataModel.get().lengthOfDecimal(); + case DATE: + return numVals * JavaDataModel.get().lengthOfDate(); + case BINARY: + // get total length of binary blob + BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics; + return bcs.getSum(); + case TIMESTAMP: + return numVals * JavaDataModel.get().lengthOfTimestamp(); + case LIST: + case MAP: + case UNION: + case STRUCT: { + TreeWriter[] childWriters = child.getChildrenWriters(); + List childTypes = schema.getChildren(); + for (int i=0; i < childWriters.length; ++i) { + total += getRawDataSize(childWriters[i], childTypes.get(i)); + } + break; } - break; - default: - LOG.debug("Unknown object inspector category."); - break; + default: + LOG.debug("Unknown object inspector category."); + break; } return total; } - private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) { - long result = 0; - long numVals = child.fileStatistics.getNumberOfValues(); - switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case FLOAT: - return numVals * JavaDataModel.get().primitive1(); - case LONG: - case DOUBLE: - return numVals * JavaDataModel.get().primitive2(); - case STRING: - case VARCHAR: - case CHAR: - // ORC strings are converted to java Strings. so use JavaDataModel to - // compute the overall size of strings - child = (StringTreeWriter) child; - StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics; - numVals = numVals == 0 ? 
1 : numVals; - int avgStringLen = (int) (scs.getSum() / numVals); - return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen); - case DECIMAL: - return numVals * JavaDataModel.get().lengthOfDecimal(); - case DATE: - return numVals * JavaDataModel.get().lengthOfDate(); - case BINARY: - // get total length of binary blob - BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics; - return bcs.getSum(); - case TIMESTAMP: - return numVals * JavaDataModel.get().lengthOfTimestamp(); - default: - LOG.debug("Unknown primitive category."); - break; - } - - return result; - } - private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) { switch (kind) { case NONE: return OrcProto.CompressionKind.NONE; @@ -2027,7 +1958,7 @@ private void writeFileStatistics(OrcProto.Footer.Builder builder, } } - private int writeMetadata(long bodyLength) throws IOException { + private int writeMetadata() throws IOException { getStream(); OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder(); for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) { @@ -2052,7 +1983,7 @@ private int writeFooter(long bodyLength) throws IOException { // populate raw data size rawDataSize = computeRawDataSize(); // serialize the types - writeTypes(builder, treeWriter); + writeTypes(builder, schema); // add the stripe information for(OrcProto.StripeInformation stripe: stripes) { builder.addStripes(stripe); @@ -2062,7 +1993,7 @@ private int writeFooter(long bodyLength) throws IOException { // add all of the user metadata for(Map.Entry entry: userMetadata.entrySet()) { builder.addMetadata(OrcProto.UserMetadataItem.newBuilder() - .setName(entry.getKey()).setValue(entry.getValue())); + .setName(entry.getKey()).setValue(entry.getValue())); } long startPosn = rawWriter.getPos(); OrcProto.Footer footer = builder.build(); @@ -2074,14 +2005,14 @@ private int writeFooter(long bodyLength) throws IOException { private int writePostScript(int 
footerLength, int metadataLength) throws IOException { OrcProto.PostScript.Builder builder = - OrcProto.PostScript.newBuilder() - .setCompression(writeCompressionKind(compress)) - .setFooterLength(footerLength) - .setMetadataLength(metadataLength) - .setMagic(OrcFile.MAGIC) - .addVersion(version.getMajor()) - .addVersion(version.getMinor()) - .setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId()); + OrcProto.PostScript.newBuilder() + .setCompression(writeCompressionKind(compress)) + .setFooterLength(footerLength) + .setMetadataLength(metadataLength) + .setMagic(OrcFile.MAGIC) + .addVersion(version.getMajor()) + .addVersion(version.getMinor()) + .setWriterVersion(OrcFile.CURRENT_WRITER.getId()); if (compress != CompressionKind.NONE) { builder.setCompressionBlockSize(bufferSize); } @@ -2120,7 +2051,7 @@ public void addTuple(Tuple tuple) throws IOException { createRowIndexEntry(); } } - memoryManager.addedRow(); + memoryManager.addedRow(1); } @Override @@ -2132,7 +2063,7 @@ public void close() throws IOException { memoryManager.removeWriter(path); // actually close the file flushStripe(); - int metadataLength = writeMetadata(rawWriter.getPos()); + int metadataLength = writeMetadata(); int footerLength = writeFooter(rawWriter.getPos() - metadataLength); rawWriter.writeByte(writePostScript(footerLength, metadataLength)); rawWriter.close(); @@ -2165,19 +2096,19 @@ public long writeIntermediateFooter() throws IOException { if (callback != null) { callback.preFooterWrite(callbackContext); } - int metaLength = writeMetadata(rawWriter.getPos()); + int metaLength = writeMetadata(); int footLength = writeFooter(rawWriter.getPos() - metaLength); rawWriter.writeByte(writePostScript(footLength, metaLength)); stripesAtLastFlush = stripes.size(); - ShimLoader.getHadoopShims().hflush(rawWriter); + rawWriter.hflush(); } return rawWriter.getPos(); } @Override public void appendStripe(byte[] stripe, int offset, int length, - StripeInformation stripeInfo, - OrcProto.StripeStatistics 
stripeStatistics) throws IOException { + StripeInformation stripeInfo, + OrcProto.StripeStatistics stripeStatistics) throws IOException { checkArgument(stripe != null, "Stripe must not be null"); checkArgument(length <= stripe.length, "Specified length must not be greater specified array length"); @@ -2187,12 +2118,11 @@ public void appendStripe(byte[] stripe, int offset, int length, getStream(); long start = rawWriter.getPos(); - long stripeLen = length; long availBlockSpace = blockSize - (start % blockSize); // see if stripe can fit in the current hdfs block, else pad the remaining // space in the block - if (stripeLen < blockSize && stripeLen > availBlockSpace && + if (length < blockSize && length > availBlockSpace && addBlockPadding) { byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)]; LOG.info(String.format("Padding ORC by %d bytes while merging..", @@ -2245,7 +2175,7 @@ private List getAllColumnTreeWriters(TreeWriter rootTreeWriter) { } private void getAllColumnTreeWritersImpl(TreeWriter tw, - List result) { + List result) { result.add(tw); for (TreeWriter child : tw.childrenWriters) { getAllColumnTreeWritersImpl(child, result); @@ -2253,9 +2183,9 @@ private void getAllColumnTreeWritersImpl(TreeWriter tw, } @Override - public void appendUserMetadata(List userMetadata) { + public void appendUserMetadata(List userMetadata) { if (userMetadata != null) { - for (UserMetadataItem item : userMetadata) { + for (OrcProto.UserMetadataItem item : userMetadata) { this.userMetadata.put(item.getName(), item.getValue()); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java deleted file mode 100644 index d0a8fa7da3..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType; -import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim; -import org.apache.hadoop.hive.shims.ShimLoader; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.EnumSet; -import java.util.zip.DataFormatException; -import java.util.zip.Deflater; -import java.util.zip.Inflater; - -class ZlibCodec implements CompressionCodec, DirectDecompressionCodec { - - private Boolean direct = null; - - private final int level; - private final int strategy; - - public ZlibCodec() { - level = Deflater.DEFAULT_COMPRESSION; - strategy = Deflater.DEFAULT_STRATEGY; - } - - private ZlibCodec(int level, int strategy) { - this.level = level; - this.strategy = strategy; - } - - @Override - public boolean compress(ByteBuffer in, ByteBuffer out, - ByteBuffer overflow) throws IOException { - Deflater deflater = new Deflater(level, true); - deflater.setStrategy(strategy); - int length = in.remaining(); - deflater.setInput(in.array(), in.arrayOffset() + in.position(), length); - deflater.finish(); - int outSize = 0; - int 
offset = out.arrayOffset() + out.position(); - while (!deflater.finished() && (length > outSize)) { - int size = deflater.deflate(out.array(), offset, out.remaining()); - out.position(size + out.position()); - outSize += size; - offset += size; - // if we run out of space in the out buffer, use the overflow - if (out.remaining() == 0) { - if (overflow == null) { - deflater.end(); - return false; - } - out = overflow; - offset = out.arrayOffset() + out.position(); - } - } - deflater.end(); - return length > outSize; - } - - @Override - public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { - - if(in.isDirect() && out.isDirect()) { - directDecompress(in, out); - return; - } - - Inflater inflater = new Inflater(true); - inflater.setInput(in.array(), in.arrayOffset() + in.position(), - in.remaining()); - while (!(inflater.finished() || inflater.needsDictionary() || - inflater.needsInput())) { - try { - int count = inflater.inflate(out.array(), - out.arrayOffset() + out.position(), - out.remaining()); - out.position(count + out.position()); - } catch (DataFormatException dfe) { - throw new IOException("Bad compression data", dfe); - } - } - out.flip(); - inflater.end(); - in.position(in.limit()); - } - - @Override - public boolean isAvailable() { - if (direct == null) { - // see nowrap option in new Inflater(boolean) which disables zlib headers - try { - if (ShimLoader.getHadoopShims().getDirectDecompressor( - DirectCompressionType.ZLIB_NOHEADER) != null) { - direct = Boolean.valueOf(true); - } else { - direct = Boolean.valueOf(false); - } - } catch (UnsatisfiedLinkError ule) { - direct = Boolean.valueOf(false); - } - } - return direct.booleanValue(); - } - - @Override - public void directDecompress(ByteBuffer in, ByteBuffer out) - throws IOException { - DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims() - .getDirectDecompressor(DirectCompressionType.ZLIB_NOHEADER); - decompressShim.decompress(in, out); - out.flip(); // flip for read 
- } - - @Override - public CompressionCodec modify(@Nullable EnumSet modifiers) { - - if (modifiers == null) { - return this; - } - - int l = this.level; - int s = this.strategy; - - for (Modifier m : modifiers) { - switch (m) { - case BINARY: - /* filtered == less LZ77, more huffman */ - s = Deflater.FILTERED; - break; - case TEXT: - s = Deflater.DEFAULT_STRATEGY; - break; - case FASTEST: - // deflate_fast looking for 8 byte patterns - l = Deflater.BEST_SPEED; - break; - case FAST: - // deflate_fast looking for 16 byte patterns - l = Deflater.BEST_SPEED + 1; - break; - case DEFAULT: - // deflate_slow looking for 128 byte patterns - l = Deflater.DEFAULT_COMPRESSION; - break; - default: - break; - } - } - return new ZlibCodec(l, s); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto index c80cf6c269..9da4b5d889 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto +++ b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto @@ -16,7 +16,9 @@ * limitations under the License. 
*/ -package org.apache.tajo.storage.thirdparty.orc; +package orc.proto; + +option java_package = "org.apache.orc"; message IntegerStatistics { optional sint64 minimum = 1; diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java index cc3f46399b..608d066913 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestCompressionStorages.java @@ -27,6 +27,7 @@ import org.apache.hadoop.io.compress.*; import org.apache.hadoop.io.compress.zlib.ZlibFactory; import org.apache.hadoop.util.NativeCodeLoader; +import org.apache.orc.OrcConf; import org.apache.tajo.BuiltinStorages; import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.catalog.Schema; @@ -38,7 +39,6 @@ import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.sequencefile.SequenceFileScanner; import org.apache.tajo.storage.text.DelimitedTextFile; -import org.apache.tajo.storage.thirdparty.orc.OrcFile.OrcTableProperties; import org.apache.tajo.util.CommonTestingUtil; import org.junit.Test; import org.junit.runner.RunWith; @@ -124,11 +124,11 @@ private void storageCompressionTest(String dataFormat, Class Date: Sun, 20 Mar 2016 00:15:50 +0900 Subject: [PATCH 03/16] Enable reader options. 
--- .../apache/tajo/storage/orc/OrcScanner.java | 21 ++++++++--- .../thirdparty/orc/OrcRecordReader.java | 37 ++++++++----------- .../thirdparty/orc/TreeReaderFactory.java | 27 +++++--------- 3 files changed, 41 insertions(+), 44 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 8082819bff..0f249cf49a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -29,12 +29,15 @@ import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.io.Text; import org.apache.orc.*; +import org.apache.orc.Reader.Options; import org.apache.orc.impl.BufferChunk; import org.apache.orc.impl.InStream; +import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.plan.expr.EvalNode; import org.apache.tajo.storage.FileScanner; +import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.fragment.Fragment; import org.apache.tajo.storage.thirdparty.orc.OrcRecordReader; @@ -44,6 +47,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; +import java.util.TimeZone; public class OrcScanner extends FileScanner { private static final Log LOG = LogFactory.getLog(OrcScanner.class); @@ -230,11 +234,16 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, ); } - public OrcRecordReader getRecordReader() throws IOException { - boolean skipCorruptRecords = conf.getBoolean("orc.skip.corrupt-records", false); + public OrcRecordReader createRecordReader() throws IOException { + return new OrcRecordReader(this.stripes, fileSystem, schema, targets, fragment, types, codec, bufferSize, + rowIndexStride, 
buildReaderOptions(meta), conf, + TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, TajoConstants.DEFAULT_SYSTEM_TIMEZONE))); + } - return new OrcRecordReader(meta, this.stripes, fileSystem, schema, targets, fragment, - skipCorruptRecords, types, codec, bufferSize, rowIndexStride, conf); + private static Options buildReaderOptions(TableMeta meta) { + return new Options() + .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(), String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue())))) + .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(), String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue())))); } @Override @@ -264,7 +273,7 @@ public void init() throws IOException { this.versionList = footerMetaData.versionList; this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); - recordReader = getRecordReader(); + recordReader = createRecordReader(); } @Override @@ -280,7 +289,7 @@ public Tuple next() throws IOException { public void reset() throws IOException { // TODO: improve this this.close(); - recordReader = getRecordReader(); + recordReader = createRecordReader(); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java index c018c802d8..7194bf4d50 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java @@ -39,10 +39,7 @@ import java.io.Closeable; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; public class OrcRecordReader implements Closeable { @@ -72,19 +69,18 @@ public class OrcRecordReader 
implements Closeable { private final DataReader dataReader; private final Tuple result; - public OrcRecordReader(TableMeta meta, - List stripes, - FileSystem fileSystem, - Schema schema, - Column[] target, - FileFragment fragment, - boolean skipCorruptRecords, - List types, - CompressionCodec codec, - int bufferSize, - long strideRate, - Configuration conf - ) throws IOException { + public OrcRecordReader(List stripes, + FileSystem fileSystem, + Schema schema, + Column[] target, + FileFragment fragment, + List types, + CompressionCodec codec, + int bufferSize, + long strideRate, + Reader.Options options, + Configuration conf, + TimeZone timeZone) throws IOException { result = new VTuple(target.length); @@ -117,17 +113,16 @@ public OrcRecordReader(TableMeta meta, } // TODO: we could change the ctor to pass this externally - this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, true, codec); + this.dataReader = RecordReaderUtils.createDefaultDataReader(fileSystem, path, options.getUseZeroCopy(), codec); this.dataReader.open(); firstRow = skippedRows; totalRowCount = rows; - Boolean skipCorrupt = skipCorruptRecords; reader = new DatumTreeReader[target.length]; for (int i = 0; i < reader.length; i++) { - reader[i] = TreeReaderFactory.createTreeReader(meta, schema.getColumnId(target[i].getQualifiedName()), target[i], - skipCorrupt); + reader[i] = TreeReaderFactory.createTreeReader(timeZone, schema.getColumnId(target[i].getQualifiedName()), target[i], + options.getSkipCorruptRecords()); } indexes = new OrcProto.RowIndex[types.size()]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index 136e5a7b5d..b31523f32b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -24,21 +24,14 @@ import org.apache.hadoop.io.Text; import org.apache.orc.OrcProto; import org.apache.orc.impl.*; -import org.apache.orc.impl.DynamicByteArray; -import org.apache.orc.impl.SerializationUtils; -import org.apache.orc.impl.StreamName; import org.apache.orc.impl.WriterImpl; -import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Column; -import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.catalog.TypeDesc; import org.apache.tajo.datum.Datum; import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.datum.NullDatum; import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.exception.UnsupportedException; -import org.apache.tajo.storage.StorageConstants; -import org.apache.tajo.unit.TimeUnit; import org.apache.tajo.util.datetime.DateTimeConstants; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -759,11 +752,11 @@ public static class TimestampTreeReader extends DatumTreeReader { private TimeZone writerTimeZone; private boolean hasSameTZRules; - TimestampTreeReader(TableMeta meta, int columnId, boolean skipCorrupt) throws IOException { - this(meta, columnId, null, null, null, null, skipCorrupt); + TimestampTreeReader(TimeZone timeZone, int columnId, boolean skipCorrupt) throws IOException { + this(timeZone, columnId, null, null, null, null, skipCorrupt); } - protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStream, InStream dataStream, + protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentStream, InStream dataStream, InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt) throws IOException { super(columnId, presentStream); @@ -772,8 +765,7 @@ protected TimestampTreeReader(TableMeta meta, int columnId, InStream presentStre this.readerTimeZone = TimeZone.getDefault(); this.writerTimeZone = readerTimeZone; 
this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone); - this.base_timestamp = getBaseTimestamp(TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, - TajoConstants.DEFAULT_SYSTEM_TIMEZONE)).getID()); + this.base_timestamp = getBaseTimestamp(timeZone.getID()); if (encoding != null) { checkEncoding(encoding); @@ -1414,6 +1406,7 @@ void skipRows(long items) throws IOException { } } + // TODO: enable this to support record type // protected static class StructTreeReader extends TreeReader { // private final int fileColumnCount; // private final int resultColumnCount; @@ -1533,10 +1526,10 @@ void skipRows(long items) throws IOException { // } // } - public static DatumTreeReader createTreeReader(TableMeta meta, - int columnId, - Column column, - boolean skipCorrupt + public static DatumTreeReader createTreeReader(TimeZone timeZone, + int columnId, + Column column, + boolean skipCorrupt ) throws IOException { TypeDesc typeDesc = column.getTypeDesc(); int orcColumnId = columnId + 1; // root record column is considered @@ -1562,7 +1555,7 @@ public static DatumTreeReader createTreeReader(TableMeta meta, case BLOB: return new BinaryTreeReader(orcColumnId); case TIMESTAMP: - return new TimestampTreeReader(meta, orcColumnId, skipCorrupt); + return new TimestampTreeReader(timeZone, orcColumnId, skipCorrupt); case DATE: return new DateTreeReader(orcColumnId); case INET4: From 6414385a139850cd7819762c9e4868f7c055d3c7 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 20 Mar 2016 00:23:51 +0900 Subject: [PATCH 04/16] Remove proto file and cleanup code. 
--- .../catalog/store/TestHiveCatalogStore.java | 2 +- tajo-storage/tajo-storage-hdfs/pom.xml | 1 - .../apache/tajo/storage/orc/ORCAppender.java | 24 +- .../apache/tajo/storage/orc/OrcScanner.java | 6 +- .../tajo/storage/thirdparty/orc/OrcUtils.java | 7 +- .../storage/thirdparty/orc/WriterImpl.java | 1 - .../src/main/proto/orc_proto.proto | 219 ------------------ 7 files changed, 23 insertions(+), 237 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java index 6bb66a1a46..46935fc259 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/test/java/org/apache/tajo/catalog/store/TestHiveCatalogStore.java @@ -78,7 +78,7 @@ public static void setUp() throws Exception { conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString()); conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri); conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, warehousePath.toUri().toString()); - conf.setBoolean("datanucleus.schema.autoCreateAll", true); // TODO: check this is valid + conf.setBoolean("datanucleus.schema.autoCreateAll", true); // create local HiveCatalogStore. 
TajoConf tajoConf = new TajoConf(conf); diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 2c5da75ef7..8688b29279 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -129,7 +129,6 @@ --proto_path=../../tajo-catalog/tajo-catalog-common/src/main/proto --java_out=target/generated-sources/proto src/main/proto/StorageFragmentProtos.proto - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index ec4349628c..b283b2219b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -111,14 +111,22 @@ private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, Tabl return OrcFile.writerOptions(conf) .setSchema(OrcUtils.convertSchema(schema)) .compress(getCompressionKind(meta)) - .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(), String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue())))) - .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(), String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue())))) - .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(), String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue())))) - .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(), String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue())))) - .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(), String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue())))) - .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(), String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue())))) - 
.bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue())))) - .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue()))); + .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(), + String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue())))) + .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(), + String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue())))) + .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(), + String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue())))) + .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(), + String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue())))) + .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(), + String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue())))) + .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(), + String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue())))) + .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(), + String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue())))) + .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), + String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue()))); } private static CompressionKind getCompressionKind(TableMeta meta) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 0f249cf49a..5d9dfac54f 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -242,8 +242,10 @@ rowIndexStride, buildReaderOptions(meta), conf, private static Options buildReaderOptions(TableMeta meta) { return new Options() - .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(), String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue())))) - .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(), String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue())))); + .useZeroCopy(Boolean.parseBoolean(meta.getProperty(OrcConf.USE_ZEROCOPY.getAttribute(), + String.valueOf(OrcConf.USE_ZEROCOPY.getDefaultValue())))) + .skipCorruptRecords(Boolean.parseBoolean(meta.getProperty(OrcConf.SKIP_CORRUPT_DATA.getAttribute(), + String.valueOf(OrcConf.SKIP_CORRUPT_DATA.getDefaultValue())))); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java index 91e4dc60d4..cc0e08e20a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java @@ -17,20 +17,17 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.typeinfo.*; import org.apache.orc.CompressionCodec; import org.apache.orc.TypeDescription; -import org.apache.orc.TypeDescription.Category; import org.apache.orc.impl.SnappyCodec; +import org.apache.orc.impl.ZlibCodec; import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.Schema; import 
org.apache.tajo.catalog.TypeDesc; -import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.exception.UnsupportedDataTypeException; @@ -215,7 +212,7 @@ public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.Compres case NONE: return null; case ZLIB: - return new org.apache.orc.impl.ZlibCodec(); + return new ZlibCodec(); case SNAPPY: return new SnappyCodec(); case LZO: diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 032885dece..e0ad3d7bed 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -870,7 +870,6 @@ void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { boolean val = datum.asBool(); - // TODO: validate the below line indexStatistics.updateBoolean(val, 1); writer.write(val ? 1 : 0); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto deleted file mode 100644 index 9da4b5d889..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto +++ /dev/null @@ -1,219 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package orc.proto; - -option java_package = "org.apache.orc"; - -message IntegerStatistics { - optional sint64 minimum = 1; - optional sint64 maximum = 2; - optional sint64 sum = 3; -} - -message DoubleStatistics { - optional double minimum = 1; - optional double maximum = 2; - optional double sum = 3; -} - -message StringStatistics { - optional string minimum = 1; - optional string maximum = 2; - // sum will store the total length of all strings in a stripe - optional sint64 sum = 3; -} - -message BucketStatistics { - repeated uint64 count = 1 [packed=true]; -} - -message DecimalStatistics { - optional string minimum = 1; - optional string maximum = 2; - optional string sum = 3; -} - -message DateStatistics { - // min,max values saved as days since epoch - optional sint32 minimum = 1; - optional sint32 maximum = 2; -} - -message TimestampStatistics { - // min,max values saved as milliseconds since epoch - optional sint64 minimum = 1; - optional sint64 maximum = 2; -} - -message BinaryStatistics { - // sum will store the total binary blob length in a stripe - optional sint64 sum = 1; -} - -message ColumnStatistics { - optional uint64 numberOfValues = 1; - optional IntegerStatistics intStatistics = 2; - optional DoubleStatistics doubleStatistics = 3; - optional StringStatistics stringStatistics = 4; - optional BucketStatistics bucketStatistics = 5; - optional DecimalStatistics decimalStatistics = 6; - optional DateStatistics dateStatistics = 7; - optional BinaryStatistics binaryStatistics = 8; - optional TimestampStatistics timestampStatistics = 
9; - optional bool hasNull = 10; -} - -message RowIndexEntry { - repeated uint64 positions = 1 [packed=true]; - optional ColumnStatistics statistics = 2; -} - -message RowIndex { - repeated RowIndexEntry entry = 1; -} - -message BloomFilter { - optional uint32 numHashFunctions = 1; - repeated fixed64 bitset = 2; -} - -message BloomFilterIndex { - repeated BloomFilter bloomFilter = 1; -} - -message Stream { - // if you add new index stream kinds, you need to make sure to update - // StreamName to ensure it is added to the stripe in the right area - enum Kind { - PRESENT = 0; - DATA = 1; - LENGTH = 2; - DICTIONARY_DATA = 3; - DICTIONARY_COUNT = 4; - SECONDARY = 5; - ROW_INDEX = 6; - BLOOM_FILTER = 7; - } - optional Kind kind = 1; - optional uint32 column = 2; - optional uint64 length = 3; -} - -message ColumnEncoding { - enum Kind { - DIRECT = 0; - DICTIONARY = 1; - DIRECT_V2 = 2; - DICTIONARY_V2 = 3; - } - optional Kind kind = 1; - optional uint32 dictionarySize = 2; -} - -message StripeFooter { - repeated Stream streams = 1; - repeated ColumnEncoding columns = 2; - optional string writerTimezone = 3; -} - -message Type { - enum Kind { - BOOLEAN = 0; - BYTE = 1; - SHORT = 2; - INT = 3; - LONG = 4; - FLOAT = 5; - DOUBLE = 6; - STRING = 7; - BINARY = 8; - TIMESTAMP = 9; - LIST = 10; - MAP = 11; - STRUCT = 12; - UNION = 13; - DECIMAL = 14; - DATE = 15; - VARCHAR = 16; - CHAR = 17; - } - optional Kind kind = 1; - repeated uint32 subtypes = 2 [packed=true]; - repeated string fieldNames = 3; - optional uint32 maximumLength = 4; - optional uint32 precision = 5; - optional uint32 scale = 6; -} - -message StripeInformation { - optional uint64 offset = 1; - optional uint64 indexLength = 2; - optional uint64 dataLength = 3; - optional uint64 footerLength = 4; - optional uint64 numberOfRows = 5; -} - -message UserMetadataItem { - optional string name = 1; - optional bytes value = 2; -} - -message StripeStatistics { - repeated ColumnStatistics colStats = 1; -} - -message 
Metadata { - repeated StripeStatistics stripeStats = 1; -} - -message Footer { - optional uint64 headerLength = 1; - optional uint64 contentLength = 2; - repeated StripeInformation stripes = 3; - repeated Type types = 4; - repeated UserMetadataItem metadata = 5; - optional uint64 numberOfRows = 6; - repeated ColumnStatistics statistics = 7; - optional uint32 rowIndexStride = 8; -} - -enum CompressionKind { - NONE = 0; - ZLIB = 1; - SNAPPY = 2; - LZO = 3; -} - -// Serialized length must be less that 255 bytes -message PostScript { - optional uint64 footerLength = 1; - optional CompressionKind compression = 2; - optional uint64 compressionBlockSize = 3; - // the version of the file format - // [0, 11] = Hive 0.11 - // [0, 12] = Hive 0.12 - repeated uint32 version = 4 [packed = true]; - optional uint64 metadataLength = 5; - // Version of the writer: - // 0 (or missing) = original - // 1 = HIVE-8732 fixed - optional uint32 writerVersion = 6; - // Leave this last in the record - optional string magic = 8000; -} From 2b9dd2443a5ae1133031846fe7d639a35e7ff8a5 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 20 Mar 2016 13:28:12 +0900 Subject: [PATCH 05/16] Fix test failure --- .../src/main/resources/storage-default.xml | 2 +- .../src/test/resources/storage-default.xml | 2 +- .../apache/tajo/storage/orc/OrcScanner.java | 2 + .../thirdparty/orc/TreeReaderFactory.java | 47 ++++++++++--------- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 7f4661b451..2454714452 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -130,7 +130,7 @@ tajo.storage.scanner-handler.orc.class - org.apache.tajo.storage.orc.ORCScanner + org.apache.tajo.storage.orc.OrcScanner diff --git 
a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml index 934dd01f24..1c4530a3cd 100644 --- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml @@ -132,7 +132,7 @@ tajo.storage.scanner-handler.orc.class - org.apache.tajo.storage.orc.ORCScanner + org.apache.tajo.storage.orc.OrcScanner diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 5d9dfac54f..86fe7ad2de 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -32,9 +32,11 @@ import org.apache.orc.Reader.Options; import org.apache.orc.impl.BufferChunk; import org.apache.orc.impl.InStream; +import org.apache.tajo.SessionVars; import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.conf.TajoConf.ConfVars; import org.apache.tajo.plan.expr.EvalNode; import org.apache.tajo.storage.FileScanner; import org.apache.tajo.storage.StorageConstants; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index b31523f32b..9b3f568fd2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -24,7 +24,6 @@ import org.apache.hadoop.io.Text; import org.apache.orc.OrcProto; import org.apache.orc.impl.*; -import 
org.apache.orc.impl.WriterImpl; import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.TypeDesc; import org.apache.tajo.datum.Datum; @@ -38,6 +37,7 @@ import java.io.EOFException; import java.io.IOException; import java.io.InputStream; +import java.sql.Timestamp; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.HashMap; @@ -45,6 +45,8 @@ import java.util.Map; import java.util.TimeZone; +import static org.apache.tajo.storage.thirdparty.orc.WriterImpl.BASE_TIMESTAMP_STRING; + public class TreeReaderFactory { private final static Log LOG = LogFactory.getLog(TreeReaderFactory.class); @@ -751,6 +753,7 @@ public static class TimestampTreeReader extends DatumTreeReader { private final TimeZone readerTimeZone; private TimeZone writerTimeZone; private boolean hasSameTZRules; + private final TimeZone timeZone; TimestampTreeReader(TimeZone timeZone, int columnId, boolean skipCorrupt) throws IOException { this(timeZone, columnId, null, null, null, null, skipCorrupt); @@ -765,7 +768,7 @@ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentS this.readerTimeZone = TimeZone.getDefault(); this.writerTimeZone = readerTimeZone; this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone); - this.base_timestamp = getBaseTimestamp(timeZone.getID()); + this.base_timestamp = getBaseTimestamp(readerTimeZone.getID()); if (encoding != null) { checkEncoding(encoding); @@ -777,6 +780,7 @@ protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentS this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt); } } + this.timeZone = timeZone; } @Override @@ -800,6 +804,7 @@ void startStripe(Map streams, streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt); base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone()); + this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / 
DateTimeConstants.MSECS_PER_SEC; } private long getBaseTimestamp(String timeZoneId) throws IOException { @@ -814,8 +819,7 @@ private long getBaseTimestamp(String timeZoneId) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); sdf.setTimeZone(writerTimeZone); try { - long epoch = - sdf.parse(org.apache.orc.impl.WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND; + long epoch = sdf.parse(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC; baseTimestampMap.put(timeZoneId, epoch); return epoch; } catch (ParseException e) { @@ -846,23 +850,24 @@ Datum next() throws IOException { if (valuePresent) { long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); - long offset = 0; - // If reader and writer time zones have different rules, adjust the timezone difference - // between reader and writer taking day light savings into account. - if (!hasSameTZRules) { - offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis); - } - long adjustedMillis = millis + offset; - - // Sometimes the reader timezone might have changed after adding the adjustedMillis. - // To account for that change, check for any difference in reader timezone after - // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time). - if (!hasSameTZRules && - (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) { - long newOffset = - writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis); - adjustedMillis = millis + newOffset; - } + long adjustedMillis = millis - timeZone.getRawOffset(); +// long offset = 0; +// // If reader and writer time zones have different rules, adjust the timezone difference +// // between reader and writer taking day light savings into account. 
+// if (!hasSameTZRules) { +// offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis); +// } +// long adjustedMillis = millis + offset; +// +// // Sometimes the reader timezone might have changed after adding the adjustedMillis. +// // To account for that change, check for any difference in reader timezone after +// // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time). +// if (!hasSameTZRules && +// (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) { +// long newOffset = +// writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis); +// adjustedMillis = millis + newOffset; +// } return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); } else { return NullDatum.get(); From 70f846477044b1fbc25bdde5a852d9dccd51ee33 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 20 Mar 2016 21:37:36 +0900 Subject: [PATCH 06/16] Add some tests --- .../tajo-catalog-drivers/tajo-hive/pom.xml | 6 +- .../tajo/catalog/store/HiveCatalogStore.java | 3 +- .../tajo/catalog/store/HiveCatalogUtil.java | 3 + .../tajo/engine/query/TestSelectQuery.java | 19 ----- .../tajo/storage/TestQueryOnOrcFile.java | 79 +++++++++++++++++++ .../timezoned/timezoned1.tbl} | 0 .../datetime_table_timezoned_ddl.sql | 5 ++ .../datetime_table_timezoned_orc_ddl.sql | 0 .../TestQueryOnOrcFile/testTimezone1.sql | 1 + .../TestSelectQuery/testTimezonedORCTable.sql | 2 - .../testTimezone1.result} | 0 .../TestQueryOnOrcFile/testTimezone2.result | 5 ++ .../TestQueryOnOrcFile/testTimezone3.result | 5 ++ .../TestQueryOnOrcFile/testTimezone4.result | 5 ++ tajo-dist/pom.xml | 9 +-- tajo-project/pom.xml | 1 + tajo-storage/tajo-storage-hdfs/pom.xml | 26 +++++- .../apache/tajo/storage/orc/OrcScanner.java | 7 +- 18 files changed, 136 insertions(+), 40 deletions(-) create mode 100644 tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java rename 
tajo-core-tests/src/test/resources/dataset/{TestSelectQuery/timezoned/table1.tbl => TestQueryOnOrcFile/timezoned/timezoned1.tbl} (100%) create mode 100644 tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql rename tajo-core-tests/src/test/resources/queries/{TestSelectQuery => TestQueryOnOrcFile}/datetime_table_timezoned_orc_ddl.sql (100%) create mode 100644 tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql delete mode 100644 tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql rename tajo-core-tests/src/test/resources/results/{TestSelectQuery/testTimezonedORCTable.result => TestQueryOnOrcFile/testTimezone1.result} (100%) create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result create mode 100644 tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml index 1a8a188b79..a1e0c98b57 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml @@ -33,8 +33,6 @@ UTF-8 UTF-8 - 1.5.0 - 2.1.0 @@ -279,8 +277,8 @@ - com.twitter - parquet-hive-bundle + org.apache.parquet + parquet-hadoop-bundle ${parquet.version} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java index 1d0d261d12..cad3c2414a 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java @@ -38,6 +38,7 @@ 
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.parquet.hadoop.ParquetOutputFormat; import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TajoConstants; import org.apache.tajo.algebra.Expr; @@ -57,10 +58,8 @@ import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.util.KeyValueSet; import org.apache.thrift.TException; -import parquet.hadoop.ParquetOutputFormat; import java.io.File; -import java.io.IOException; import java.util.*; public class HiveCatalogStore extends CatalogConstants implements CatalogStore { diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java index bbb7adeee3..87b391ea60 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogUtil.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.serde.serdeConstants; @@ -137,6 +138,8 @@ public static String getDataFormat(StorageDescriptor descriptor) { return BuiltinStorages.PARQUET; } else if (AvroSerDe.class.getName().equals(serde)) { return BuiltinStorages.AVRO; + } else if (OrcSerde.class.getName().equals(serde)) { + return BuiltinStorages.ORC; } else { throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat)); } diff --git 
a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index e55acf1fc0..a2dec50e91 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -682,25 +682,6 @@ public void testLoadIntoTimezonedTable() throws Exception { executeString("DROP TABLE IF EXISTS timezoned_load2 PURGE"); } } - - @Test - public void testTimezonedORCTable() throws Exception { - try { - - executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); - executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc"); - - executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned"); - - ResultSet res = executeQuery(); - assertResultSet(res, "testTimezonedORCTable.result"); - executeString("SET TIME ZONE 'GMT'"); - cleanupQuery(res); - } finally { - executeString("DROP TABLE IF EXISTS timezoned"); - executeString("DROP TABLE IF EXISTS timezoned_orc PURGE"); - } - } @Test public void testMultiBytesDelimiter1() throws Exception { diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java new file mode 100644 index 0000000000..29d132e35f --- /dev/null +++ b/tajo-core-tests/src/test/java/org/apache/tajo/storage/TestQueryOnOrcFile.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage; + +import org.apache.tajo.IntegrationTest; +import org.apache.tajo.QueryTestCaseBase; +import org.junit.*; +import org.junit.experimental.categories.Category; + +import java.sql.ResultSet; + +@Category(IntegrationTest.class) +public class TestQueryOnOrcFile extends QueryTestCaseBase { + + @Before + public void setup() throws Exception { + executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); + executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc"); + + executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned"); + } + + @After + public void teardown() throws Exception { + executeString("DROP TABLE IF EXISTS timezoned"); + executeString("DROP TABLE IF EXISTS timezoned_orc PURGE"); + } + + @Test + public void testTimezone1() throws Exception { + executeString("SET TIME ZONE 'GMT+9'"); + ResultSet res = executeQuery(); + assertResultSet(res); + executeString("SET TIME ZONE 'GMT'"); + cleanupQuery(res); + } + + @Test + public void testTimezone2() throws Exception { + executeString("SET TIME ZONE 'GMT+1'"); + ResultSet res = executeString("select * from timezoned_orc"); + assertResultSet(res); + executeString("SET TIME ZONE 'GMT'"); + cleanupQuery(res); + } + + @Test + public void testTimezone3() throws Exception { + executeString("SET TIME ZONE 'GMT'"); + ResultSet res = executeString("select * from timezoned_orc"); + assertResultSet(res); + cleanupQuery(res); + } + + @Test + public void testTimezone4() throws Exception { + 
executeString("\\set TIMEZONE 'GMT-5'"); + ResultSet res = executeString("select * from timezoned_orc"); + assertResultSet(res); + executeString("SET TIME ZONE 'GMT'"); + cleanupQuery(res); + } +} diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl b/tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl similarity index 100% rename from tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/table1.tbl rename to tajo-core-tests/src/test/resources/dataset/TestQueryOnOrcFile/timezoned/timezoned1.tbl diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql new file mode 100644 index 0000000000..9c5d30d22c --- /dev/null +++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_ddl.sql @@ -0,0 +1,5 @@ +CREATE EXTERNAL TABLE ${0} ( + t_timestamp TIMESTAMP, + t_time TIME, + t_date DATE +) USING TEXT WITH ('timezone' = 'GMT+9') LOCATION ${table.path} diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql similarity index 100% rename from tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql rename to tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/datetime_table_timezoned_orc_ddl.sql diff --git a/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql new file mode 100644 index 0000000000..2464c974ae --- /dev/null +++ b/tajo-core-tests/src/test/resources/queries/TestQueryOnOrcFile/testTimezone1.sql @@ -0,0 +1 @@ +SELECT * FROM timezoned_orc; \ No newline at end of file diff --git 
a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql deleted file mode 100644 index 1d898bd73c..0000000000 --- a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql +++ /dev/null @@ -1,2 +0,0 @@ -SET SESSION TIMEZONE = 'GMT+9'; -SELECT * FROM timezoned_orc; \ No newline at end of file diff --git a/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result similarity index 100% rename from tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result rename to tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone1.result diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result new file mode 100644 index 0000000000..c0e5ceffe1 --- /dev/null +++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone2.result @@ -0,0 +1,5 @@ +t_timestamp,t_date +------------------------------- +1980-03-31 17:50:30.01,1980-04-01 +1980-03-31 17:50:30,1980-04-01 +1980-03-31 17:50:30,1980-04-01 \ No newline at end of file diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result new file mode 100644 index 0000000000..916f4be8dd --- /dev/null +++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone3.result @@ -0,0 +1,5 @@ +t_timestamp,t_date +------------------------------- +1980-03-31 16:50:30.01,1980-04-01 +1980-03-31 16:50:30,1980-04-01 +1980-03-31 16:50:30,1980-04-01 \ No newline at end of file diff --git a/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result 
b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result new file mode 100644 index 0000000000..98e0918610 --- /dev/null +++ b/tajo-core-tests/src/test/resources/results/TestQueryOnOrcFile/testTimezone4.result @@ -0,0 +1,5 @@ +t_timestamp,t_date +------------------------------- +1980-03-31 11:50:30.01,1980-04-01 +1980-03-31 11:50:30,1980-04-01 +1980-03-31 11:50:30,1980-04-01 \ No newline at end of file diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index 095f128809..a91c431a60 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -162,13 +162,8 @@ run mkdir -p extlib - if [ -f $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar ] - then - run cp -r $ROOT/tajo-catalog/tajo-catalog-drivers/tajo-hive/target/lib/parquet-hive-bundle-*.jar lib/ - echo - echo "Tajo installed parquet-hive-bundle library at: ${project.build.directory}/tajo-${project.version}" - echo - fi + run mkdir -p lib + run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/ echo echo "Tajo dist layout available at: ${project.build.directory}/tajo-${project.version}" diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index 16e1eb074f..27fa66be32 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -40,6 +40,7 @@ 4.0.34.Final 2.6 6.1.26 + 1.8.1 ${project.parent.relativePath}/.. 
src/main/hadoop-${hadoop.version} diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 8688b29279..a3a46fe416 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -34,7 +34,6 @@ UTF-8 UTF-8 - 1.8.1 @@ -160,6 +159,26 @@ org.apache.maven.plugins maven-surefire-report-plugin + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + runtime + ${project.build.directory}/lib + false + false + true + + + + @@ -348,6 +367,11 @@ hive-orc ${hive.version} + + org.apache.hive + hive-storage-api + ${hive.version} + org.apache.hive hive-serde diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 86fe7ad2de..5b159919de 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -26,17 +26,14 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.io.Text; import org.apache.orc.*; import org.apache.orc.Reader.Options; import org.apache.orc.impl.BufferChunk; import org.apache.orc.impl.InStream; -import org.apache.tajo.SessionVars; import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.conf.TajoConf.ConfVars; import org.apache.tajo.plan.expr.EvalNode; import org.apache.tajo.storage.FileScanner; import org.apache.tajo.storage.StorageConstants; @@ -360,7 +357,7 @@ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos, bb.position(footerAbsPos); bb.limit(footerAbsPos + footerSize); return 
OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer", - Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); + Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); } private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, @@ -368,7 +365,7 @@ private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsP bb.position(metadataAbsPos); bb.limit(metadataAbsPos + metadataSize); return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata", - Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize)); + Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize)); } /** From 358b9159e4c40349e7a76e873529e9d064eba171 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 20 Mar 2016 22:16:56 +0900 Subject: [PATCH 07/16] Fix test failure and cleanup hive catalog dependency. --- .../tajo-catalog-drivers/tajo-hive/pom.xml | 192 +++++++++++++----- .../TestSelectQuery/timezoned/timezoned1.tbl | 3 + tajo-dist/pom.xml | 6 +- tajo-storage/tajo-storage-hdfs/pom.xml | 1 + 4 files changed, 147 insertions(+), 55 deletions(-) create mode 100644 tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml index a1e0c98b57..d8484613fb 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hive/pom.xml @@ -134,19 +134,35 @@ org.apache.hadoop - hadoop-mapreduce-client-core + hadoop-common ${hadoop.version} provided + + + zookeeper + org.apache.zookeeper + + org.apache.hadoop - hadoop-common + hadoop-mapreduce-client-core ${hadoop.version} provided + + + hadoop-yarn-common + org.apache.hadoop + + + netty + io.netty + + org.apache.hive - hive-exec + hive-metastore ${hive.version} provided @@ -156,123 +172,195 @@ org.apache.hive - hive-contrib + 
hive-serde org.apache.hive - hive-hbase-handler + hive-shimss - org.apache.hive - hive-metastore + org.apache.thrift + libfb303 - org.apache.hive - hive-serde + org.apache.thrift + libthrift - org.apache.hive - hive-shims + com.jolbox + bonecp - org.apache.hive - hive-testutils + tephra-hbase-compat-1.0 + co.cask.tephra - org.apache.thrift - libfb303 + tephra-core + co.cask.tephra - org.apache.thrift - libthrift + tephra-api + co.cask.tephra - com.jolbox - bonecp + hbase-client + org.apache.hbase - com.google.protobuf - protobuf-java + hadoop-yarn-server-resourcemanager + org.apache.hadoop - org.apache.calcite - calcite-core + antlr-runtime + org.antlr - org.apache.calcite - calcite-avatica + log4j-slf4j-impl + org.apache.logging.log4j + + + zookeeper + org.apache.zookeeper org.apache.hive - hive-metastore + hive-common ${hive.version} provided - org.apache.hive - hive-common + jetty-all + org.eclipse.jetty.aggregate - org.apache.hive - hive-serde + javax.servlet + org.eclipse.jetty.orbit - org.apache.hive - hive-shimss + joda-time + joda-time - org.apache.thrift - libfb303 + jackson-databind + com.fasterxml.jackson.core - org.apache.thrift - libthrift + metrics-json + io.dropwizard.metrics - com.jolbox - bonecp + metrics-jvm + io.dropwizard.metrics + + + metrics-core + io.dropwizard.metrics + + + ant + org.apache.ant + + + json + org.json + + + log4j-slf4j-impl + org.apache.logging.log4j + + + log4j-web + org.apache.logging.log4j + + + log4j-1.2-api + org.apache.logging.log4j org.apache.hive - hive-cli + hive-exec ${hive.version} provided + hive-ant org.apache.hive - hive-common + hive-llap-tez org.apache.hive - hive-exec - org.apache.hive - hive-metastore + ST4 + org.antlr - org.apache.hive - hive-serde + ivy + org.apache.ivy - org.apache.hive - hive-service + curator-framework + org.apache.curator - org.apache.hive - hive-shims + apache-curator + org.apache.curator - com.jolbox - bonecp + groovy-all + org.codehaus.groovy + + + calcite-core + org.apache.calcite + 
+ + calcite-avatica + org.apache.calcite + + + stax-api + stax - jline jline + jline + + + log4j-1.2-api + org.apache.logging.log4j + + + log4j-slf4j-impl + org.apache.logging.log4j + + + ant + org.apache.ant + + + zookeeper + org.apache.zookeeper + + + antlr-runtime + org.antlr + + + + + org.apache.hive + hive-serde + ${hive.version} + provided + + + opencsv + net.sf.opencsv diff --git a/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl new file mode 100644 index 0000000000..74b2e1b273 --- /dev/null +++ b/tajo-core-tests/src/test/resources/dataset/TestSelectQuery/timezoned/timezoned1.tbl @@ -0,0 +1,3 @@ +1980-4-1 01:50:30.010|01:50:30.010|1980-04-01 +80/4/1 1:50:30 AM|1:50:30 AM|80/4/1 +1980 April 1 1:50:30|1:50:30|1980-04-01 \ No newline at end of file diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index a91c431a60..b742f5eccf 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -154,6 +154,9 @@ run cp -r ${project.basedir}/src/main/conf . 
run rm -rf lib/tajo-*-${project.version}.jar + run mkdir -p lib + run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/ + run mkdir hive run mv lib/hive-*.jar hive/ @@ -162,9 +165,6 @@ run mkdir -p extlib - run mkdir -p lib - run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/ - echo echo "Tajo dist layout available at: ${project.build.directory}/tajo-${project.version}" echo diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index a3a46fe416..2a0396a529 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -376,6 +376,7 @@ org.apache.hive hive-serde ${hive.version} + provided log4j-slf4j-impl From a79fa2920de301c1e81bc7c5568f4e1c35650429 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 09:42:37 +0900 Subject: [PATCH 08/16] fix test failure --- tajo-storage/tajo-storage-hdfs/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 2a0396a529..a3a46fe416 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -376,7 +376,6 @@ org.apache.hive hive-serde ${hive.version} - provided log4j-slf4j-impl From b67427fc277dc5dde3498bf4b2856a8260afc069 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 15:07:06 +0900 Subject: [PATCH 09/16] Remove hive shims dependency --- tajo-storage/tajo-storage-hdfs/pom.xml | 70 +++---- .../apache/tajo/storage/orc/ORCAppender.java | 6 - .../ObjectInspectorFactory.java | 91 --------- .../TajoBlobObjectInspector.java | 82 -------- .../TajoBooleanObjectInspector.java | 76 -------- .../TajoDateObjectInspector.java | 73 -------- .../TajoDoubleObjectInspector.java | 76 -------- .../TajoFloatObjectInspector.java | 76 -------- .../TajoIntObjectInspector.java | 76 -------- .../TajoLongObjectInspector.java | 76 -------- .../TajoNullObjectInspector.java | 69 ------- 
.../TajoShortObjectInspector.java | 76 -------- .../TajoStringObjectInspector.java | 71 ------- .../TajoStructObjectInspector.java | 122 ------------ .../TajoTimestampObjectInspector.java | 73 -------- .../orc/ByteBufferAllocatorPool.java | 102 ++++++++++ .../orc/ByteBufferPoolAdapter.java} | 25 +-- .../tajo/storage/thirdparty/orc/OrcUtils.java | 175 ------------------ .../thirdparty/orc/RecordReaderUtils.java | 96 +--------- .../thirdparty/orc/ZeroCopyAdapter.java | 57 ++++++ 20 files changed, 213 insertions(+), 1355 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferAllocatorPool.java rename tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/{orc/objectinspector/TajoPrimitiveObjectInspector.java => thirdparty/orc/ByteBufferPoolAdapter.java} (62%) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index a3a46fe416..f940f5b681 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -372,41 +372,41 @@ hive-storage-api ${hive.version} - - org.apache.hive - hive-serde - ${hive.version} - - - log4j-slf4j-impl - org.apache.logging.log4j - - - log4j-1.2-api - org.apache.logging.log4j - - - hive-common - org.apache.hive - - - libthrift - org.apache.thrift - - - opencsv - net.sf.opencsv - - - hadoop-yarn-server-resourcemanager - org.apache.hadoop - - - hive-shims-scheduler - org.apache.hive.shims - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index b283b2219b..ebdfa3224e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -20,7 +20,6 @@ 
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.orc.CompressionKind; import org.apache.orc.OrcConf; import org.apache.orc.TypeDescription; @@ -152,7 +151,6 @@ private static CompressionKind getCompressionKind(TableMeta meta) { */ public static class WriterOptions extends OrcFile.WriterOptions { private boolean explicitSchema = false; - private ObjectInspector inspector = null; // Setting the default batch size to 1000 makes the memory check at 5000 // rows work the same as the row by row writer. (If it was the default 1024, // the smallest stripe size would be 5120 rows, which changes the output @@ -179,10 +177,6 @@ protected WriterOptions batchSize(int maxSize) { return this; } - ObjectInspector getInspector() { - return inspector; - } - int getBatchSize() { return batchSize; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java deleted file mode 100644 index 4855ff9fe3..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.exception.UnsupportedException; - -public class ObjectInspectorFactory { - - public static StructObjectInspector buildStructObjectInspector(Schema schema) { - StructObjectInspector structOI = new TajoStructObjectInspector(schema); - return structOI; - } - - public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type dataType) throws UnsupportedException { - ObjectInspector oi = null; - - switch(dataType) { - case BOOLEAN: - oi = new TajoBooleanObjectInspector(); - break; - - case INT2: - oi = new TajoShortObjectInspector(); - break; - - case INET4: - case INT4: - oi = new TajoIntObjectInspector(); - break; - - case INT8: - oi = new TajoLongObjectInspector(); - break; - - case FLOAT4: - oi = new TajoFloatObjectInspector(); - break; - - case FLOAT8: - oi = new TajoDoubleObjectInspector(); - break; - - case TEXT: - case CHAR: - oi = new TajoStringObjectInspector(); - break; - - case TIMESTAMP: - oi = new TajoTimestampObjectInspector(); - break; - - case DATE: - oi = new TajoDateObjectInspector(); - break; - - case BLOB: - case PROTOBUF: - oi = new TajoBlobObjectInspector(); - break; - - case NULL_TYPE: - oi = new TajoNullObjectInspector(); - break; - - default: - throw new UnsupportedException(dataType.name()+" is not 
supported yet in ORCAppender"); - } - - return oi; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java deleted file mode 100644 index d241f84371..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.BytesWritable; -import org.apache.tajo.datum.Datum; - -public class TajoBlobObjectInspector extends TajoPrimitiveObjectInspector implements BinaryObjectInspector { - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.binaryTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.BINARY; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public BytesWritable getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return byte [].class; - } - - @Override - public byte[] getPrimitiveJavaObject(Object o) { - return ((Datum)o).asByteArray(); - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public int precision() { - return 0; - } - - @Override - public int scale() { - return 0; - } - - @Override - public String getTypeName() { - return "BINARY"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java deleted file mode 100644 index 273505f0cb..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Datum; - -public class TajoBooleanObjectInspector extends TajoPrimitiveObjectInspector implements BooleanObjectInspector { - @Override - public boolean get(Object o) { - return ((Datum)o).asBool(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.booleanTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.BOOLEAN; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Boolean.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "BOOLEAN"; - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java deleted file mode 100644 index f12706b8df..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -import java.sql.Date; - -public class TajoDateObjectInspector extends TajoPrimitiveObjectInspector implements DateObjectInspector { - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.dateTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.DATE; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public DateWritable getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return null; - } - - @Override - public Date getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "DATE"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java deleted file mode 100644 index 6dc1f8c95c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Float8Datum; - -public class TajoDoubleObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector { - @Override - public double get(Object o) { - return ((Float8Datum)o).asFloat8(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.doubleTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.DOUBLE; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Double.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "DOUBLE"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java deleted file mode 100644 index bed8784fb5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Float4Datum; - -public class TajoFloatObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector { - @Override - public double get(Object o) { - return ((Float4Datum)o).asFloat4(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.floatTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.FLOAT; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Float.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "FLOAT"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java deleted file mode 100644 index a0c2209678..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Int4Datum; - -public class TajoIntObjectInspector extends TajoPrimitiveObjectInspector implements IntObjectInspector { - @Override - public int get(Object o) { - return ((Int4Datum)o).asInt4(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.intTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.INT; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Integer.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "INT"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java deleted file mode 100644 index b30b3338f6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Int8Datum; - -public class TajoLongObjectInspector extends TajoPrimitiveObjectInspector implements LongObjectInspector { - @Override - public long get(Object o) { - return ((Int8Datum)o).asInt8(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.shortTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.LONG; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Long.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "LONG"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java deleted file mode 100644 index 49998ce30e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -public class TajoNullObjectInspector extends TajoPrimitiveObjectInspector { - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.voidTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.VOID; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Void.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "NULL"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java deleted file mode 100644 index d32bee172a..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.tajo.datum.Int2Datum; - -public class TajoShortObjectInspector extends TajoPrimitiveObjectInspector implements ShortObjectInspector { - @Override - public short get(Object o) { - return ((Int2Datum)o).asInt2(); - } - - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.shortTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.SHORT; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Object getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return Short.class; - } - - @Override - public Object getPrimitiveJavaObject(Object o) { - 
return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "SHORT"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java deleted file mode 100644 index b9331da6cd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.io.Text; - -public class TajoStringObjectInspector extends TajoPrimitiveObjectInspector implements StringObjectInspector { - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.stringTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.STRING; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public Text getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return null; - } - - @Override - public String getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "STRING"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java deleted file mode 100644 index 7521fa32c6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.tajo.catalog.Column; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.exception.UnsupportedException; - -import java.util.ArrayList; -import java.util.List; - -public class TajoStructObjectInspector extends StructObjectInspector { - private final static Log LOG = LogFactory.getLog(TajoStructObjectInspector.class); - private List structFields; - - static class TajoStructField implements StructField { - private String name; - private ObjectInspector oi; - private String comment; - - TajoStructField(String name, ObjectInspector oi) { - this(name, oi, null); - } - - TajoStructField(String name, ObjectInspector oi, String comment) { - this.name = name; - this.oi = oi; - this.comment = comment; - } - - @Override - public String getFieldName() { - return name; - } - - @Override - public ObjectInspector getFieldObjectInspector() { - return oi; - } - - @Override - public int getFieldID() { - return 0; - } - - @Override - public String getFieldComment() { - return comment; - } - } - - 
TajoStructObjectInspector(Schema schema) { - structFields = new ArrayList<>(schema.size()); - - for (Column c: schema.getRootColumns()) { - try { - TajoStructField field = new TajoStructField(c.getSimpleName(), - ObjectInspectorFactory.buildObjectInspectorByType(c.getDataType().getType())); - structFields.add(field); - } catch (UnsupportedException e) { - LOG.error(e.getMessage()); - } - } - } - - @Override - public List getAllStructFieldRefs() { - return structFields; - } - - @Override - public StructField getStructFieldRef(String s) { - for (TajoStructField field:structFields) { - if (field.getFieldName().equals(s)) { - return field; - } - } - - return null; - } - - @Override - public Object getStructFieldData(Object o, StructField structField) { - return null; - } - - @Override - public List getStructFieldsDataAsList(Object o) { - return null; - } - - @Override - public String getTypeName() { - return "STRUCT"; - } - - @Override - public Category getCategory() { - return Category.STRUCT; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java deleted file mode 100644 index bb887e79da..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.orc.objectinspector; - -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; - -import java.sql.Timestamp; - -public class TajoTimestampObjectInspector extends TajoPrimitiveObjectInspector implements TimestampObjectInspector { - @Override - public PrimitiveTypeInfo getTypeInfo() { - return TypeInfoFactory.timestampTypeInfo; - } - - @Override - public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.TIMESTAMP; - } - - @Override - public Class getPrimitiveWritableClass() { - return null; - } - - @Override - public TimestampWritable getPrimitiveWritableObject(Object o) { - return null; - } - - @Override - public Class getJavaPrimitiveClass() { - return null; - } - - @Override - public Timestamp getPrimitiveJavaObject(Object o) { - return null; - } - - @Override - public Object copyObject(Object o) { - return null; - } - - @Override - public boolean preferWritable() { - return false; - } - - @Override - public String getTypeName() { - return "TIMESTAMP"; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferAllocatorPool.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferAllocatorPool.java new file mode 100644 index 0000000000..de60bb284a --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferAllocatorPool.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ComparisonChain; +import org.apache.commons.lang.builder.HashCodeBuilder; + +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.TreeMap; + +public class ByteBufferAllocatorPool { + private static final class Key implements Comparable { + private final int capacity; + private final long insertionGeneration; + + Key(int capacity, long insertionGeneration) { + this.capacity = capacity; + this.insertionGeneration = insertionGeneration; + } + + @Override + public int compareTo(Key other) { + return ComparisonChain.start().compare(capacity, other.capacity) + .compare(insertionGeneration, other.insertionGeneration).result(); + } + + @Override + public boolean equals(Object rhs) { + if (rhs == null) { + return false; + } + try { + Key o = (Key) rhs; + return (compareTo(o) == 0); + } catch (ClassCastException e) { + return false; + } + } + + @Override + public int hashCode() { + return new 
HashCodeBuilder().append(capacity).append(insertionGeneration) + .toHashCode(); + } + } + + private final TreeMap buffers = new TreeMap(); + + private final TreeMap directBuffers = new TreeMap(); + + private long currentGeneration = 0; + + private final TreeMap getBufferTree(boolean direct) { + return direct ? directBuffers : buffers; + } + + public void clear() { + buffers.clear(); + directBuffers.clear(); + } + + public ByteBuffer getBuffer(boolean direct, int length) { + TreeMap tree = getBufferTree(direct); + Map.Entry entry = tree.ceilingEntry(new Key(length, 0)); + if (entry == null) { + return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer + .allocate(length); + } + tree.remove(entry.getKey()); + return entry.getValue(); + } + + public void putBuffer(ByteBuffer buffer) { + TreeMap tree = getBufferTree(buffer.isDirect()); + while (true) { + Key key = new Key(buffer.capacity(), currentGeneration++); + if (!tree.containsKey(key)) { + tree.put(key, buffer); + return; + } + // Buffers are indexed by (capacity, generation). 
+ // If our key is not unique on the first try, we try again + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferPoolAdapter.java similarity index 62% rename from tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java rename to tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferPoolAdapter.java index 90ac178fdd..2e9aec11ad 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ByteBufferPoolAdapter.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -16,23 +16,26 @@ * limitations under the License. 
*/ -package org.apache.tajo.storage.orc.objectinspector; +package org.apache.tajo.storage.thirdparty.orc; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.io.ByteBufferPool; -public abstract class TajoPrimitiveObjectInspector implements PrimitiveObjectInspector { - @Override - public Category getCategory() { - return Category.PRIMITIVE; +import java.nio.ByteBuffer; + +public class ByteBufferPoolAdapter implements ByteBufferPool { + private ByteBufferAllocatorPool pool; + + public ByteBufferPoolAdapter(ByteBufferAllocatorPool pool) { + this.pool = pool; } @Override - public int precision() { - return 0; + public final ByteBuffer getBuffer(boolean direct, int length) { + return this.pool.getBuffer(direct, length); } @Override - public int scale() { - return 0; + public final void putBuffer(ByteBuffer buffer) { + this.pool.putBuffer(buffer); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java index cc0e08e20a..b8d3f52c67 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java @@ -17,10 +17,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.serde2.objectinspector.*; import org.apache.orc.CompressionCodec; import org.apache.orc.TypeDescription; import org.apache.orc.impl.SnappyCodec; @@ -31,182 +29,9 @@ import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.exception.UnsupportedDataTypeException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - public class 
OrcUtils { private static final Log LOG = LogFactory.getLog(OrcUtils.class); - /** - * Returns selected columns as a boolean array with true value set for specified column names. - * The result will contain number of elements equal to flattened number of columns. - * For example: - * selectedColumns - a,b,c - * allColumns - a,b,c,d - * If column c is a complex type, say list and other types are primitives then result will - * be [false, true, true, true, true, true, false] - * Index 0 is the root element of the struct which is set to false by default, index 1,2 - * corresponds to columns a and b. Index 3,4 correspond to column c which is list and - * index 5 correspond to column d. After flattening list gets 2 columns. - * - * @param selectedColumns - comma separated list of selected column names - * @param allColumns - comma separated list of all column names - * @param inspector - object inspector - * @return - boolean array with true value set for the specified column names - */ - public static boolean[] includeColumns(String selectedColumns, String allColumns, - ObjectInspector inspector) { - int numFlattenedCols = getFlattenedColumnsCount(inspector); - boolean[] results = new boolean[numFlattenedCols]; - if ("*".equals(selectedColumns)) { - Arrays.fill(results, true); - return results; - } - if (selectedColumns != null && !selectedColumns.isEmpty()) { - includeColumnsImpl(results, selectedColumns.toLowerCase(), allColumns, inspector); - } - return results; - } - - private static void includeColumnsImpl(boolean[] includeColumns, String selectedColumns, - String allColumns, - ObjectInspector inspector) { - Map> columnSpanMap = getColumnSpan(allColumns, inspector); - LOG.info("columnSpanMap: " + columnSpanMap); - - String[] selCols = selectedColumns.split(","); - for (String sc : selCols) { - if (columnSpanMap.containsKey(sc)) { - List colSpan = columnSpanMap.get(sc); - int start = colSpan.get(0); - int end = colSpan.get(1); - for (int i = start; i <= end; i++) { 
- includeColumns[i] = true; - } - } - } - - LOG.info("includeColumns: " + Arrays.toString(includeColumns)); - } - - private static Map> getColumnSpan(String allColumns, - ObjectInspector inspector) { - // map that contains the column span for each column. Column span is the number of columns - // required after flattening. For a given object inspector this map contains the start column - // id and end column id (both inclusive) after flattening. - // EXAMPLE: - // schema: struct> - // column span map for the above struct will be - // a => [1,1], b => [2,2], c => [3,5] - Map> columnSpanMap = new HashMap<>(); - if (allColumns != null) { - String[] columns = allColumns.split(","); - int startIdx = 0; - int endIdx = 0; - if (inspector instanceof StructObjectInspector) { - StructObjectInspector soi = (StructObjectInspector) inspector; - List fields = soi.getAllStructFieldRefs(); - for (int i = 0; i < fields.size(); i++) { - StructField sf = fields.get(i); - - // we get the type (category) from object inspector but column name from the argument. - // The reason for this is hive (FileSinkOperator) does not pass the actual column names, - // instead it passes the internal column names (_col1,_col2). 
- ObjectInspector sfOI = sf.getFieldObjectInspector(); - String colName = columns[i]; - - startIdx = endIdx + 1; - switch (sfOI.getCategory()) { - case PRIMITIVE: - endIdx += 1; - break; - case STRUCT: - endIdx += 1; - StructObjectInspector structInsp = (StructObjectInspector) sfOI; - List structFields = structInsp.getAllStructFieldRefs(); - for (StructField structField : structFields) { - endIdx += getFlattenedColumnsCount(structField.getFieldObjectInspector()); - } - break; - case MAP: - endIdx += 1; - MapObjectInspector mapInsp = (MapObjectInspector) sfOI; - endIdx += getFlattenedColumnsCount(mapInsp.getMapKeyObjectInspector()); - endIdx += getFlattenedColumnsCount(mapInsp.getMapValueObjectInspector()); - break; - case LIST: - endIdx += 1; - ListObjectInspector listInsp = (ListObjectInspector) sfOI; - endIdx += getFlattenedColumnsCount(listInsp.getListElementObjectInspector()); - break; - case UNION: - endIdx += 1; - UnionObjectInspector unionInsp = (UnionObjectInspector) sfOI; - List choices = unionInsp.getObjectInspectors(); - for (ObjectInspector choice : choices) { - endIdx += getFlattenedColumnsCount(choice); - } - break; - default: - throw new IllegalArgumentException("Bad category: " + - inspector.getCategory()); - } - - columnSpanMap.put(colName, Lists.newArrayList(startIdx, endIdx)); - } - } - } - return columnSpanMap; - } - - /** - * Returns the number of columns after flatting complex types. 
- * - * @param inspector - object inspector - * @return - */ - public static int getFlattenedColumnsCount(ObjectInspector inspector) { - int numWriters = 0; - switch (inspector.getCategory()) { - case PRIMITIVE: - numWriters += 1; - break; - case STRUCT: - numWriters += 1; - StructObjectInspector structInsp = (StructObjectInspector) inspector; - List fields = structInsp.getAllStructFieldRefs(); - for (StructField field : fields) { - numWriters += getFlattenedColumnsCount(field.getFieldObjectInspector()); - } - break; - case MAP: - numWriters += 1; - MapObjectInspector mapInsp = (MapObjectInspector) inspector; - numWriters += getFlattenedColumnsCount(mapInsp.getMapKeyObjectInspector()); - numWriters += getFlattenedColumnsCount(mapInsp.getMapValueObjectInspector()); - break; - case LIST: - numWriters += 1; - ListObjectInspector listInsp = (ListObjectInspector) inspector; - numWriters += getFlattenedColumnsCount(listInsp.getListElementObjectInspector()); - break; - case UNION: - numWriters += 1; - UnionObjectInspector unionInsp = (UnionObjectInspector) inspector; - List choices = unionInsp.getObjectInspectors(); - for (ObjectInspector choice : choices) { - numWriters += getFlattenedColumnsCount(choice); - } - break; - default: - throw new IllegalArgumentException("Bad category: " + - inspector.getCategory()); - } - return numWriters; - } - public static org.apache.orc.CompressionCodec createCodec(org.apache.orc.CompressionKind kind) { switch (kind) { case NONE: diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java index 5253711664..bc882e09f0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RecordReaderUtils.java @@ -18,15 +18,11 @@ 
package org.apache.tajo.storage.thirdparty.orc; -import com.google.common.collect.ComparisonChain; -import org.apache.commons.lang.builder.HashCodeBuilder; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.hive.common.io.DiskRangeList; -import org.apache.hadoop.hive.shims.HadoopShims; -import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; import org.apache.orc.OrcProto; @@ -38,15 +34,13 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.TreeMap; public class RecordReaderUtils { public static class DefaultDataReader implements DataReader { private FSDataInputStream file; private ByteBufferAllocatorPool pool; - private HadoopShims.ZeroCopyReaderShim zcr; + private ZeroCopyAdapter zcr; private FileSystem fs; private Path path; private boolean useZeroCopy; @@ -113,7 +107,7 @@ public long getReadBytes() { * @throws IOException */ private DiskRangeList readDiskRanges(FSDataInputStream file, - HadoopShims.ZeroCopyReaderShim zcr, + ZeroCopyAdapter zcr, long base, DiskRangeList range, boolean doForceDirect) throws IOException { @@ -387,93 +381,13 @@ public static List getStreamBuffers(DiskRangeList range, long offset, return buffers; } - static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file, - CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException { + static ZeroCopyAdapter createZeroCopyShim(FSDataInputStream file, + CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException { if ((codec == null || ((codec instanceof DirectDecompressionCodec) && ((DirectDecompressionCodec) codec).isAvailable()))) { /* codec is null or is available */ - return ShimLoader.getHadoopShims().getZeroCopyReader(file, pool); + return new ZeroCopyAdapter(file, 
pool); } return null; } - - // this is an implementation copied from ElasticByteBufferPool in hadoop-2, - // which lacks a clear()/clean() operation - public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim { - private static final class Key implements Comparable { - private final int capacity; - private final long insertionGeneration; - - Key(int capacity, long insertionGeneration) { - this.capacity = capacity; - this.insertionGeneration = insertionGeneration; - } - - @Override - public int compareTo(Key other) { - return ComparisonChain.start().compare(capacity, other.capacity) - .compare(insertionGeneration, other.insertionGeneration).result(); - } - - @Override - public boolean equals(Object rhs) { - if (rhs == null) { - return false; - } - try { - Key o = (Key) rhs; - return (compareTo(o) == 0); - } catch (ClassCastException e) { - return false; - } - } - - @Override - public int hashCode() { - return new HashCodeBuilder().append(capacity).append(insertionGeneration) - .toHashCode(); - } - } - - private final TreeMap buffers = new TreeMap(); - - private final TreeMap directBuffers = new TreeMap(); - - private long currentGeneration = 0; - - private final TreeMap getBufferTree(boolean direct) { - return direct ? directBuffers : buffers; - } - - public void clear() { - buffers.clear(); - directBuffers.clear(); - } - - @Override - public ByteBuffer getBuffer(boolean direct, int length) { - TreeMap tree = getBufferTree(direct); - Map.Entry entry = tree.ceilingEntry(new Key(length, 0)); - if (entry == null) { - return direct ? 
ByteBuffer.allocateDirect(length) : ByteBuffer - .allocate(length); - } - tree.remove(entry.getKey()); - return entry.getValue(); - } - - @Override - public void putBuffer(ByteBuffer buffer) { - TreeMap tree = getBufferTree(buffer.isDirect()); - while (true) { - Key key = new Key(buffer.capacity(), currentGeneration++); - if (!tree.containsKey(key)) { - tree.put(key, buffer); - return; - } - // Buffers are indexed by (capacity, generation). - // If our key is not unique on the first try, we try again - } - } - } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java new file mode 100644 index 0000000000..2886fe7794 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZeroCopyAdapter.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.ReadOption; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; + +public class ZeroCopyAdapter { + private final FSDataInputStream in; + private final ByteBufferPoolAdapter pool; + private final static EnumSet CHECK_SUM = EnumSet + .noneOf(ReadOption.class); + private final static EnumSet NO_CHECK_SUM = EnumSet + .of(ReadOption.SKIP_CHECKSUMS); + + public ZeroCopyAdapter(FSDataInputStream in, ByteBufferAllocatorPool poolshim) { + this.in = in; + if (poolshim != null) { + pool = new ByteBufferPoolAdapter(poolshim); + } else { + pool = null; + } + } + + public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) + throws IOException { + EnumSet options = NO_CHECK_SUM; + if (verifyChecksums) { + options = CHECK_SUM; + } + return this.in.read(this.pool, maxLength, options); + } + + public final void releaseBuffer(ByteBuffer buffer) { + this.in.releaseBuffer(buffer); + } +} From 0f03428e87c8fab920f7711402398ef6b6ab0dcc Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 17:31:54 +0900 Subject: [PATCH 10/16] Fix failure on create table --- .../tajo/catalog/store/HiveCatalogStore.java | 11 ++++++ tajo-storage/tajo-storage-hdfs/pom.xml | 35 ------------------- .../apache/tajo/storage/orc/ORCAppender.java | 12 +++---- .../apache/tajo/storage/orc/OrcScanner.java | 4 +-- 4 files changed, 19 insertions(+), 43 deletions(-) diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java index cad3c2414a..95cbf18fa8 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java +++ 
b/tajo-catalog/tajo-catalog-drivers/tajo-hive/src/main/java/org/apache/tajo/catalog/store/HiveCatalogStore.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.orc.OrcConf; import org.apache.parquet.hadoop.ParquetOutputFormat; import org.apache.tajo.BuiltinStorages; import org.apache.tajo.TajoConstants; @@ -564,6 +565,16 @@ public final void createTable(final CatalogProtos.TableDescProto tableDescProto) table.putToParameters(ParquetOutputFormat.COMPRESSION, tableDesc.getMeta().getProperty(ParquetOutputFormat.COMPRESSION)); } + } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.ORC)) { + StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.ORC); + sd.setInputFormat(descriptor.getInputFormat()); + sd.setOutputFormat(descriptor.getOutputFormat()); + sd.getSerdeInfo().setSerializationLib(descriptor.getSerde()); + + if (tableDesc.getMeta().containsProperty(OrcConf.COMPRESS.getAttribute())) { + table.putToParameters(OrcConf.COMPRESS.getAttribute(), + tableDesc.getMeta().getProperty(OrcConf.COMPRESS.getAttribute())); + } } else { throw new UnsupportedException(tableDesc.getMeta().getDataFormat() + " in HivecatalogStore"); } diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index f940f5b681..aa6e6a66c2 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -372,41 +372,6 @@ hive-storage-api ${hive.version} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index ebdfa3224e..fcbdb34742 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -86,10 +86,9 @@ public void flush() throws IOException { public void close() throws IOException { writer.close(); - // TODO: getOffset is not implemented yet -// if (tableStatsEnabled) { -// stats.setNumBytes(getOffset()); -// } + if (tableStatsEnabled) { + stats.setNumBytes(writer.getRawDataSize()); + } } @Override @@ -103,7 +102,7 @@ public TableStats getStats() { @Override public long getEstimatedOutputSize() throws IOException { - return writer.getRawDataSize() * writer.getNumberOfRows(); + return writer.getRawDataSize(); } private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) { @@ -129,7 +128,8 @@ private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, Tabl } private static CompressionKind getCompressionKind(TableMeta meta) { - String kindstr = meta.getProperty(StorageConstants.ORC_COMPRESSION, StorageConstants.DEFAULT_ORC_COMPRESSION_KIND); + String kindstr = meta.getProperty(OrcConf.COMPRESS.getAttribute(), + String.valueOf(OrcConf.COMPRESS.getDefaultValue())); if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) { return CompressionKind.ZLIB; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 5b159919de..c8aa67b404 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -249,8 +249,6 @@ private static Options buildReaderOptions(TableMeta meta) { @Override public void init() throws IOException { - super.init(); - FileMetaInfo footerMetaData = extractMetaInfoFromFooter(fileSystem, path, maxLength); 
this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; MetaInfoObjExtractor rInfo = @@ -275,6 +273,8 @@ public void init() throws IOException { this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); recordReader = createRecordReader(); + + super.init(); } @Override From c992eff19581e3a5ff820b15416b1391bcfe318d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 17:38:22 +0900 Subject: [PATCH 11/16] Move orc and storage-api jars to lib. --- tajo-dist/pom.xml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index b742f5eccf..652ab84204 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -156,10 +156,7 @@ run mkdir -p lib run cp -r $ROOT/tajo-storage/tajo-storage-hdfs/target/lib/hive-*.jar lib/ - - run mkdir hive - run mv lib/hive-*.jar hive/ - + run mkdir -p share/jdbc-dist run cp -r $ROOT/tajo-jdbc/target/tajo-jdbc-${project.version}-jar-with-dependencies.jar ./share/jdbc-dist/tajo-jdbc-${project.version}.jar From 53b2e851af1b981b6a04f1c7902adc0e79363db7 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 18:50:48 +0900 Subject: [PATCH 12/16] Add log4j and jdo libraries to lib. --- tajo-dist/src/main/bin/tajo | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tajo-dist/src/main/bin/tajo b/tajo-dist/src/main/bin/tajo index c08c538201..007e960ffb 100755 --- a/tajo-dist/src/main/bin/tajo +++ b/tajo-dist/src/main/bin/tajo @@ -300,11 +300,15 @@ if [ ! 
-z ${HIVE_HOME} ] && [ -d ${HIVE_HOME} ] && [ -d ${HIVE_LIB} ]; then CLASSPATH=${CLASSPATH}:$f; done - for f in ${HIVE_LIB}/datanucleus-*.jar; do + for f in ${HIVE_LIB}/javax.jdo-*.jar; do CLASSPATH=${CLASSPATH}:$f; done -else - for f in $TAJO_HOME/hive/*.jar; do + + for f in ${HIVE_LIB}/log4j-core-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in ${HIVE_LIB}/datanucleus-*.jar; do CLASSPATH=${CLASSPATH}:$f; done fi From 3a5d3e8eedf89d10191362cbcfe3974236934530 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 21 Mar 2016 20:04:49 +0900 Subject: [PATCH 13/16] Remove commented out code --- .../thirdparty/orc/TreeReaderFactory.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index 9b3f568fd2..1a17860c69 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -851,23 +851,6 @@ Datum next() throws IOException { if (valuePresent) { long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); long adjustedMillis = millis - timeZone.getRawOffset(); -// long offset = 0; -// // If reader and writer time zones have different rules, adjust the timezone difference -// // between reader and writer taking day light savings into account. -// if (!hasSameTZRules) { -// offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis); -// } -// long adjustedMillis = millis + offset; -// -// // Sometimes the reader timezone might have changed after adding the adjustedMillis. -// // To account for that change, check for any difference in reader timezone after -// // adding adjustedMillis. 
If so use the new offset (offset at adjustedMillis point of time). -// if (!hasSameTZRules && -// (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) { -// long newOffset = -// writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis); -// adjustedMillis = millis + newOffset; -// } return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); } else { return NullDatum.get(); From 7e6e5271919564f8fc5cf67930a83717eecda48c Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 22 Mar 2016 12:34:35 +0900 Subject: [PATCH 14/16] Fix timezone bug --- .../org/apache/tajo/jdbc/TajoResultSetBase.java | 5 ++--- .../java/org/apache/tajo/datum/DateDatum.java | 17 +++++++++++++++++ .../org/apache/tajo/datum/DatumFactory.java | 4 ++-- .../java/org/apache/tajo/datum/TimeDatum.java | 10 +--------- .../org/apache/tajo/datum/TimestampDatum.java | 17 +---------------- .../apache/tajo/util/datetime/DateTimeUtil.java | 15 +++++++++++++++ .../apache/tajo/datum/TestTimestampDatum.java | 2 +- .../apache/tajo/engine/eval/ExprTestBase.java | 4 ++-- .../tajo/engine/eval/TestSQLExpression.java | 6 +++--- .../storage/TextSerializerDeserializer.java | 5 +++-- .../tajo/storage/json/JsonLineSerializer.java | 5 +++-- .../apache/tajo/storage/orc/ORCAppender.java | 13 ++++++------- .../text/TextFieldSerializerDeserializer.java | 5 +++-- .../thirdparty/orc/TreeReaderFactory.java | 5 ++--- 14 files changed, 61 insertions(+), 52 deletions(-) diff --git a/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java b/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java index 5cc43096b0..b105324b04 100644 --- a/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java +++ b/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java @@ -22,7 +22,6 @@ import org.apache.tajo.SessionVars; import org.apache.tajo.catalog.Schema; import org.apache.tajo.common.TajoDataTypes; -import 
org.apache.tajo.datum.*; import org.apache.tajo.storage.Tuple; import org.apache.tajo.util.datetime.DateTimeUtil; import org.apache.tajo.util.datetime.TimeMeta; @@ -268,10 +267,10 @@ private String getString(Tuple tuple, int index) throws SQLException { switch(tuple.type(index)) { case BOOLEAN: return String.valueOf(tuple.getBool(index)); + case DATE: case TIME: - return TimeDatum.asChars(tuple.getTimeDate(index), timezone, false); case TIMESTAMP: - return TimestampDatum.asChars(tuple.getTimeDate(index), timezone, false); + return DateTimeUtil.tmToChars(tuple.getTimeDate(index), timezone, false); default : return tuple.asDatum(index).asChars(); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java index ac84e259dd..7044767a8f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java @@ -30,6 +30,8 @@ import org.apache.tajo.util.datetime.DateTimeUtil; import org.apache.tajo.util.datetime.TimeMeta; +import java.util.TimeZone; + public class DateDatum extends Datum { public static final int SIZE = 4; @@ -186,6 +188,21 @@ public String asChars() { return DateTimeUtil.encodeDate(asTimeMeta(), DateStyle.ISO_DATES); } + /** + * + * @param tm TimeMeta + * @param timeZone Timezone + * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ + * @return A timestamp string + */ + public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { + DateTimeUtil.toUserTimezone(tm, timeZone); + if (includeTimeZone) { + tm.timeZone = timeZone.getRawOffset() / 1000; + } + return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); + } + public String toChars(String format) { return DateTimeFormat.to_char(asTimeMeta(), format); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java index dd4a4e440a..1f0a90f8d4 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java @@ -431,7 +431,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case TIMESTAMP: { TimestampDatum timestampDatum = (TimestampDatum)operandDatum; if (tz != null) { - return DatumFactory.createText(TimestampDatum.asChars(operandDatum.asTimeMeta(), tz, false)); + return DatumFactory.createText(DateTimeUtil.tmToChars(operandDatum.asTimeMeta(), tz, false)); } else { return DatumFactory.createText(timestampDatum.asChars()); } @@ -439,7 +439,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case TIME: { TimeDatum timeDatum = (TimeDatum)operandDatum; if (tz != null) { - return DatumFactory.createText(TimeDatum.asChars(operandDatum.asTimeMeta(), tz, false)); + return DatumFactory.createText(DateTimeUtil.tmToChars(operandDatum.asTimeMeta(), tz, false)); } else { return DatumFactory.createText(timeDatum.asChars()); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java index e70d7d5a47..d45d8c3e55 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java @@ -98,16 +98,8 @@ 
public String asChars() { return DateTimeUtil.encodeTime(tm, DateStyle.ISO_DATES); } - public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { - DateTimeUtil.toUserTimezone(tm, timeZone); - if (includeTimeZone) { - tm.timeZone = timeZone.getRawOffset() / 1000; - } - return DateTimeUtil.encodeTime(tm, DateStyle.ISO_DATES); - } - public String toString(TimeZone timeZone, boolean includeTimeZone) { - return asChars(asTimeMeta(), timeZone, includeTimeZone); + return DateTimeUtil.tmToChars(asTimeMeta(), timeZone, includeTimeZone); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java index f69e7da2f4..9c815ee197 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java @@ -123,23 +123,8 @@ public String toString() { return asChars(); } - /** - * - * @param tm TimeMeta - * @param timeZone Timezone - * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ - * @return A timestamp string - */ - public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { - DateTimeUtil.toUserTimezone(tm, timeZone); - if (includeTimeZone) { - tm.timeZone = timeZone.getRawOffset() / 1000; - } - return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); - } - public String toString(TimeZone timeZone, boolean includeTimeZone) { - return asChars(asTimeMeta(), timeZone, includeTimeZone); + return DateTimeUtil.tmToChars(asTimeMeta(), timeZone, includeTimeZone); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java index 5a338d39a5..d0ac43b5f7 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java @@ -2185,4 +2185,19 @@ public static TimeMeta getUTCDateTime(long time) { return tm; } + /** + * + * @param tm TimeMeta + * @param timeZone Timezone + * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ + * @return A timestamp string + */ + public static String tmToChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { + DateTimeUtil.toUserTimezone(tm, timeZone); + if (includeTimeZone) { + tm.timeZone = timeZone.getRawOffset() / 1000; + } + return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); + } + } diff --git a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java index dc8a8819bf..2c1fd7c5d4 100644 --- a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java +++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java @@ -120,7 +120,7 @@ public final void testToJson() { public final void testTimeZone() { TimestampDatum datum = new TimestampDatum(DateTimeUtil.toJulianTimestamp(2014, 5, 1, 15, 20, 30, 0)); assertEquals("2014-05-01 15:20:30", datum.asChars()); - assertEquals("2014-05-02 00:20:30+09", TimestampDatum.asChars(datum.asTimeMeta(), TimeZone.getTimeZone("GMT+9"), true)); + assertEquals("2014-05-02 00:20:30+09", DateTimeUtil.tmToChars(datum.asTimeMeta(), TimeZone.getTimeZone("GMT+9"), true)); } @Test diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java index 61faee619e..afb512daea 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java @@ -315,9 +315,9 @@ public void testEval(OverridableConf context, Schema schema, String tableName, S for (int i = 0; i < expected.length; i++) { String outTupleAsChars; if (outTuple.type(i) == Type.TIMESTAMP) { - outTupleAsChars = TimestampDatum.asChars(outTuple.getTimeDate(i), timeZone, false); + outTupleAsChars = DateTimeUtil.tmToChars(outTuple.getTimeDate(i), timeZone, false); } else if (outTuple.type(i) == Type.TIME) { - 
outTupleAsChars = TimeDatum.asChars(outTuple.getTimeDate(i), timeZone, false); + outTupleAsChars = DateTimeUtil.tmToChars(outTuple.getTimeDate(i), timeZone, false); } else { outTupleAsChars = outTuple.asDatum(i).toString(); } diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java index fe51aa4eff..120483bef5 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java @@ -861,7 +861,7 @@ public void testCastWithNestedFunction() throws TajoException { int unixtime = 1389071574; // (int) (System.currentTimeMillis() / 1000); TimestampDatum expected = DatumFactory.createTimestmpDatumWithUnixTime(unixtime); testSimpleEval(context, String.format("select to_timestamp(CAST(split_part('%d.999', '.', 1) as INT8));", unixtime), - new String[] {TimestampDatum.asChars(expected.asTimeMeta(), tz, false)}); + new String[] {DateTimeUtil.tmToChars(expected.asTimeMeta(), tz, false)}); } @Test @@ -887,11 +887,11 @@ public void testCastFromTable() throws TajoException { testEval(queryContext, schema, "table1", "1980-04-01 01:50:01,234", "select col1::timestamp as t1, col2::float from table1 where t1 = '1980-04-01 01:50:01'::timestamp", - new String[]{TimestampDatum.asChars(timestamp.asTimeMeta(), tz, false), "234.0"} + new String[]{DateTimeUtil.tmToChars(timestamp.asTimeMeta(), tz, false), "234.0"} ); testSimpleEval("select '1980-04-01 01:50:01'::timestamp;", new String[]{ - TimestampDatum.asChars(timestamp.asTimeMeta(), tz, false)}); + DateTimeUtil.tmToChars(timestamp.asTimeMeta(), tz, false)}); testSimpleEval("select '1980-04-01 01:50:01'::timestamp::text", new String[]{"1980-04-01 01:50:01"}); testSimpleEval("select (cast ('99999'::int8 as text))::int4 + 1", new String[]{"100000"}); diff --git 
a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java index 1ec13bcdb2..313ff3194d 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java @@ -28,6 +28,7 @@ import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; +import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -101,12 +102,12 @@ public int serialize(int index, Tuple tuple, OutputStream out, byte[] nullCharac out.write(bytes); break; case TIME: - bytes = TimeDatum.asChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); + bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); length = bytes.length; out.write(bytes); break; case TIMESTAMP: - bytes = TimestampDatum.asChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); + bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); length = bytes.length; out.write(bytes); break; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java index 1885c80ca3..12e2d09078 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java @@ -35,6 +35,7 @@ import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.Tuple; import 
org.apache.tajo.storage.text.TextLineSerializer; +import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -117,14 +118,14 @@ private void putValue(JSONObject json, case TIMESTAMP: if (hasTimezone) { - json.put(fieldName, TimestampDatum.asChars(input.getTimeDate(fieldIndex), timezone, false)); + json.put(fieldName, DateTimeUtil.tmToChars(input.getTimeDate(fieldIndex), timezone, false)); } else { json.put(fieldName, input.asDatum(fieldIndex).asChars()); } break; case TIME: if (hasTimezone) { - json.put(fieldName, TimeDatum.asChars(input.getTimeDate(fieldIndex), timezone, false)); + json.put(fieldName, DateTimeUtil.tmToChars(input.getTimeDate(fieldIndex), timezone, false)); } else { json.put(fieldName, input.asDatum(fieldIndex).asChars()); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index fcbdb34742..b27c6401cf 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -50,8 +50,9 @@ public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem TableMeta meta, Path workDir) { super(conf, taskAttemptId, schema, meta, workDir); - timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, - TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); + timezone = meta.containsProperty(StorageConstants.TIMEZONE) ? 
+ TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE)) : + TimeZone.getDefault(); } @Override @@ -86,9 +87,9 @@ public void flush() throws IOException { public void close() throws IOException { writer.close(); - if (tableStatsEnabled) { - stats.setNumBytes(writer.getRawDataSize()); - } +// if (tableStatsEnabled) { +// stats.setNumBytes(getOffset()); +// } } @Override @@ -150,7 +151,6 @@ private static CompressionKind getCompressionKind(TableMeta meta) { * Options for creating ORC file writers. */ public static class WriterOptions extends OrcFile.WriterOptions { - private boolean explicitSchema = false; // Setting the default batch size to 1000 makes the memory check at 5000 // rows work the same as the row by row writer. (If it was the default 1024, // the smallest stripe size would be 5120 rows, which changes the output @@ -167,7 +167,6 @@ public WriterOptions(Properties tableProperties, Configuration conf) { * @return this */ public WriterOptions setSchema(TypeDescription schema) { - this.explicitSchema = true; super.setSchema(schema); return this; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index 53b6bcda46..c48098de59 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -35,6 +35,7 @@ import org.apache.tajo.storage.Tuple; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; +import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -124,7 +125,7 @@ public int serialize(int columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIME: if (hasTimezone) { - bytes = 
TimeDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); + bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } @@ -133,7 +134,7 @@ public int serialize(int columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIMESTAMP: if (hasTimezone) { - bytes = TimestampDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); + bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index 1a17860c69..6ab630aed1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -803,8 +803,7 @@ void startStripe(Map streams, nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt); - base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone()); - this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC; + getBaseTimestamp(stripeFooter.getWriterTimezone()); } private long getBaseTimestamp(String timeZoneId) throws IOException { @@ -850,7 +849,7 @@ Datum next() throws IOException { if (valuePresent) { long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); - long adjustedMillis = millis - timeZone.getRawOffset(); + long adjustedMillis = millis - writerTimeZone.getRawOffset(); return 
DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); } else { return NullDatum.get(); From 65936e6973c68d9a39a0c27f79bf972ba1c19bf6 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 22 Mar 2016 14:05:08 +0900 Subject: [PATCH 15/16] Revert "Fix timezone bug" This reverts commit 7e6e5271919564f8fc5cf67930a83717eecda48c. --- .../org/apache/tajo/jdbc/TajoResultSetBase.java | 5 +++-- .../java/org/apache/tajo/datum/DateDatum.java | 17 ----------------- .../org/apache/tajo/datum/DatumFactory.java | 4 ++-- .../java/org/apache/tajo/datum/TimeDatum.java | 10 +++++++++- .../org/apache/tajo/datum/TimestampDatum.java | 17 ++++++++++++++++- .../apache/tajo/util/datetime/DateTimeUtil.java | 15 --------------- .../apache/tajo/datum/TestTimestampDatum.java | 2 +- .../apache/tajo/engine/eval/ExprTestBase.java | 4 ++-- .../tajo/engine/eval/TestSQLExpression.java | 6 +++--- .../storage/TextSerializerDeserializer.java | 5 ++--- .../tajo/storage/json/JsonLineSerializer.java | 5 ++--- .../apache/tajo/storage/orc/ORCAppender.java | 13 +++++++------ .../text/TextFieldSerializerDeserializer.java | 5 ++--- .../thirdparty/orc/TreeReaderFactory.java | 5 +++-- 14 files changed, 52 insertions(+), 61 deletions(-) diff --git a/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java b/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java index b105324b04..5cc43096b0 100644 --- a/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java +++ b/tajo-client/src/main/java/org/apache/tajo/jdbc/TajoResultSetBase.java @@ -22,6 +22,7 @@ import org.apache.tajo.SessionVars; import org.apache.tajo.catalog.Schema; import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.datum.*; import org.apache.tajo.storage.Tuple; import org.apache.tajo.util.datetime.DateTimeUtil; import org.apache.tajo.util.datetime.TimeMeta; @@ -267,10 +268,10 @@ private String getString(Tuple tuple, int index) throws SQLException { 
switch(tuple.type(index)) { case BOOLEAN: return String.valueOf(tuple.getBool(index)); - case DATE: case TIME: + return TimeDatum.asChars(tuple.getTimeDate(index), timezone, false); case TIMESTAMP: - return DateTimeUtil.tmToChars(tuple.getTimeDate(index), timezone, false); + return TimestampDatum.asChars(tuple.getTimeDate(index), timezone, false); default : return tuple.asDatum(index).asChars(); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java index 7044767a8f..ac84e259dd 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java @@ -30,8 +30,6 @@ import org.apache.tajo.util.datetime.DateTimeUtil; import org.apache.tajo.util.datetime.TimeMeta; -import java.util.TimeZone; - public class DateDatum extends Datum { public static final int SIZE = 4; @@ -188,21 +186,6 @@ public String asChars() { return DateTimeUtil.encodeDate(asTimeMeta(), DateStyle.ISO_DATES); } - /** - * - * @param tm TimeMeta - * @param timeZone Timezone - * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ - * @return A timestamp string - */ - public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { - DateTimeUtil.toUserTimezone(tm, timeZone); - if (includeTimeZone) { - tm.timeZone = timeZone.getRawOffset() / 1000; - } - return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); - } - public String toChars(String format) { return DateTimeFormat.to_char(asTimeMeta(), format); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java index 1f0a90f8d4..dd4a4e440a 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java @@ -431,7 +431,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case TIMESTAMP: { TimestampDatum timestampDatum = (TimestampDatum)operandDatum; if (tz != null) { - return DatumFactory.createText(DateTimeUtil.tmToChars(operandDatum.asTimeMeta(), tz, false)); + return DatumFactory.createText(TimestampDatum.asChars(operandDatum.asTimeMeta(), tz, false)); } else { return DatumFactory.createText(timestampDatum.asChars()); } @@ -439,7 +439,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case TIME: { TimeDatum timeDatum = (TimeDatum)operandDatum; if (tz != null) { - return DatumFactory.createText(DateTimeUtil.tmToChars(operandDatum.asTimeMeta(), tz, false)); + return DatumFactory.createText(TimeDatum.asChars(operandDatum.asTimeMeta(), tz, false)); } else { return DatumFactory.createText(timeDatum.asChars()); } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java index d45d8c3e55..e70d7d5a47 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java @@ -98,8 +98,16 @@ 
public String asChars() { return DateTimeUtil.encodeTime(tm, DateStyle.ISO_DATES); } + public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { + DateTimeUtil.toUserTimezone(tm, timeZone); + if (includeTimeZone) { + tm.timeZone = timeZone.getRawOffset() / 1000; + } + return DateTimeUtil.encodeTime(tm, DateStyle.ISO_DATES); + } + public String toString(TimeZone timeZone, boolean includeTimeZone) { - return DateTimeUtil.tmToChars(asTimeMeta(), timeZone, includeTimeZone); + return asChars(asTimeMeta(), timeZone, includeTimeZone); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java index 9c815ee197..f69e7da2f4 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java @@ -123,8 +123,23 @@ public String toString() { return asChars(); } + /** + * + * @param tm TimeMeta + * @param timeZone Timezone + * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ + * @return A timestamp string + */ + public static String asChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { + DateTimeUtil.toUserTimezone(tm, timeZone); + if (includeTimeZone) { + tm.timeZone = timeZone.getRawOffset() / 1000; + } + return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); + } + public String toString(TimeZone timeZone, boolean includeTimeZone) { - return DateTimeUtil.tmToChars(asTimeMeta(), timeZone, includeTimeZone); + return asChars(asTimeMeta(), timeZone, includeTimeZone); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java index d0ac43b5f7..5a338d39a5 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java @@ -2185,19 +2185,4 @@ public static TimeMeta getUTCDateTime(long time) { return tm; } - /** - * - * @param tm TimeMeta - * @param timeZone Timezone - * @param includeTimeZone Add timezone if it is true. 
It is usually used for TIMEZONEZ - * @return A timestamp string - */ - public static String tmToChars(TimeMeta tm, TimeZone timeZone, boolean includeTimeZone) { - DateTimeUtil.toUserTimezone(tm, timeZone); - if (includeTimeZone) { - tm.timeZone = timeZone.getRawOffset() / 1000; - } - return DateTimeUtil.encodeDateTime(tm, DateStyle.ISO_DATES); - } - } diff --git a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java index 2c1fd7c5d4..dc8a8819bf 100644 --- a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java +++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java @@ -120,7 +120,7 @@ public final void testToJson() { public final void testTimeZone() { TimestampDatum datum = new TimestampDatum(DateTimeUtil.toJulianTimestamp(2014, 5, 1, 15, 20, 30, 0)); assertEquals("2014-05-01 15:20:30", datum.asChars()); - assertEquals("2014-05-02 00:20:30+09", DateTimeUtil.tmToChars(datum.asTimeMeta(), TimeZone.getTimeZone("GMT+9"), true)); + assertEquals("2014-05-02 00:20:30+09", TimestampDatum.asChars(datum.asTimeMeta(), TimeZone.getTimeZone("GMT+9"), true)); } @Test diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java index afb512daea..61faee619e 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java @@ -315,9 +315,9 @@ public void testEval(OverridableConf context, Schema schema, String tableName, S for (int i = 0; i < expected.length; i++) { String outTupleAsChars; if (outTuple.type(i) == Type.TIMESTAMP) { - outTupleAsChars = DateTimeUtil.tmToChars(outTuple.getTimeDate(i), timeZone, false); + outTupleAsChars = TimestampDatum.asChars(outTuple.getTimeDate(i), timeZone, false); } else if (outTuple.type(i) == Type.TIME) { - 
outTupleAsChars = DateTimeUtil.tmToChars(outTuple.getTimeDate(i), timeZone, false); + outTupleAsChars = TimeDatum.asChars(outTuple.getTimeDate(i), timeZone, false); } else { outTupleAsChars = outTuple.asDatum(i).toString(); } diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java index 120483bef5..fe51aa4eff 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/eval/TestSQLExpression.java @@ -861,7 +861,7 @@ public void testCastWithNestedFunction() throws TajoException { int unixtime = 1389071574; // (int) (System.currentTimeMillis() / 1000); TimestampDatum expected = DatumFactory.createTimestmpDatumWithUnixTime(unixtime); testSimpleEval(context, String.format("select to_timestamp(CAST(split_part('%d.999', '.', 1) as INT8));", unixtime), - new String[] {DateTimeUtil.tmToChars(expected.asTimeMeta(), tz, false)}); + new String[] {TimestampDatum.asChars(expected.asTimeMeta(), tz, false)}); } @Test @@ -887,11 +887,11 @@ public void testCastFromTable() throws TajoException { testEval(queryContext, schema, "table1", "1980-04-01 01:50:01,234", "select col1::timestamp as t1, col2::float from table1 where t1 = '1980-04-01 01:50:01'::timestamp", - new String[]{DateTimeUtil.tmToChars(timestamp.asTimeMeta(), tz, false), "234.0"} + new String[]{TimestampDatum.asChars(timestamp.asTimeMeta(), tz, false), "234.0"} ); testSimpleEval("select '1980-04-01 01:50:01'::timestamp;", new String[]{ - DateTimeUtil.tmToChars(timestamp.asTimeMeta(), tz, false)}); + TimestampDatum.asChars(timestamp.asTimeMeta(), tz, false)}); testSimpleEval("select '1980-04-01 01:50:01'::timestamp::text", new String[]{"1980-04-01 01:50:01"}); testSimpleEval("select (cast ('99999'::int8 as text))::int4 + 1", new String[]{"100000"}); diff --git 
a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java index 313ff3194d..1ec13bcdb2 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java @@ -28,7 +28,6 @@ import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; -import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -102,12 +101,12 @@ public int serialize(int index, Tuple tuple, OutputStream out, byte[] nullCharac out.write(bytes); break; case TIME: - bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); + bytes = TimeDatum.asChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); length = bytes.length; out.write(bytes); break; case TIMESTAMP: - bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); + bytes = TimestampDatum.asChars(tuple.getTimeDate(index), TimeZone.getDefault(), true).getBytes(); length = bytes.length; out.write(bytes); break; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java index 12e2d09078..1885c80ca3 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java @@ -35,7 +35,6 @@ import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.Tuple; import 
org.apache.tajo.storage.text.TextLineSerializer; -import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -118,14 +117,14 @@ private void putValue(JSONObject json, case TIMESTAMP: if (hasTimezone) { - json.put(fieldName, DateTimeUtil.tmToChars(input.getTimeDate(fieldIndex), timezone, false)); + json.put(fieldName, TimestampDatum.asChars(input.getTimeDate(fieldIndex), timezone, false)); } else { json.put(fieldName, input.asDatum(fieldIndex).asChars()); } break; case TIME: if (hasTimezone) { - json.put(fieldName, DateTimeUtil.tmToChars(input.getTimeDate(fieldIndex), timezone, false)); + json.put(fieldName, TimeDatum.asChars(input.getTimeDate(fieldIndex), timezone, false)); } else { json.put(fieldName, input.asDatum(fieldIndex).asChars()); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index b27c6401cf..fcbdb34742 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -50,9 +50,8 @@ public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem TableMeta meta, Path workDir) { super(conf, taskAttemptId, schema, meta, workDir); - timezone = meta.containsProperty(StorageConstants.TIMEZONE) ? 
- TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE)) : - TimeZone.getDefault(); + timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, + TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); } @Override @@ -87,9 +86,9 @@ public void flush() throws IOException { public void close() throws IOException { writer.close(); -// if (tableStatsEnabled) { -// stats.setNumBytes(getOffset()); -// } + if (tableStatsEnabled) { + stats.setNumBytes(writer.getRawDataSize()); + } } @Override @@ -151,6 +150,7 @@ private static CompressionKind getCompressionKind(TableMeta meta) { * Options for creating ORC file writers. */ public static class WriterOptions extends OrcFile.WriterOptions { + private boolean explicitSchema = false; // Setting the default batch size to 1000 makes the memory check at 5000 // rows work the same as the row by row writer. (If it was the default 1024, // the smallest stripe size would be 5120 rows, which changes the output @@ -167,6 +167,7 @@ public WriterOptions(Properties tableProperties, Configuration conf) { * @return this */ public WriterOptions setSchema(TypeDescription schema) { + this.explicitSchema = true; super.setSchema(schema); return this; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index c48098de59..53b6bcda46 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -35,7 +35,6 @@ import org.apache.tajo.storage.Tuple; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; -import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import java.io.OutputStream; @@ -125,7 +124,7 @@ public int serialize(int 
columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIME: if (hasTimezone) { - bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); + bytes = TimeDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } @@ -134,7 +133,7 @@ public int serialize(int columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIMESTAMP: if (hasTimezone) { - bytes = DateTimeUtil.tmToChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); + bytes = TimestampDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index 6ab630aed1..1a17860c69 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -803,7 +803,8 @@ void startStripe(Map streams, nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt); - getBaseTimestamp(stripeFooter.getWriterTimezone()); + base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone()); + this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC; } private long getBaseTimestamp(String timeZoneId) throws IOException { @@ -849,7 +850,7 @@ Datum next() throws IOException { if (valuePresent) { long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); - long adjustedMillis = 
millis - writerTimeZone.getRawOffset(); + long adjustedMillis = millis - timeZone.getRawOffset(); return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); } else { return NullDatum.get(); From fa983bcbdb8d4f6899ac9f63914a8bcc001372e4 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 22 Mar 2016 14:38:41 +0900 Subject: [PATCH 16/16] Fix timestamp bug. --- .../org/apache/tajo/storage/orc/ORCAppender.java | 13 ++++++------- .../storage/thirdparty/orc/TreeReaderFactory.java | 5 ++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index fcbdb34742..b27c6401cf 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -50,8 +50,9 @@ public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem TableMeta meta, Path workDir) { super(conf, taskAttemptId, schema, meta, workDir); - timezone = TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE, - TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); + timezone = meta.containsProperty(StorageConstants.TIMEZONE) ? + TimeZone.getTimeZone(meta.getProperty(StorageConstants.TIMEZONE)) : + TimeZone.getDefault(); } @Override @@ -86,9 +87,9 @@ public void flush() throws IOException { public void close() throws IOException { writer.close(); - if (tableStatsEnabled) { - stats.setNumBytes(writer.getRawDataSize()); - } +// if (tableStatsEnabled) { +// stats.setNumBytes(getOffset()); +// } } @Override @@ -150,7 +151,6 @@ private static CompressionKind getCompressionKind(TableMeta meta) { * Options for creating ORC file writers. 
*/ public static class WriterOptions extends OrcFile.WriterOptions { - private boolean explicitSchema = false; // Setting the default batch size to 1000 makes the memory check at 5000 // rows work the same as the row by row writer. (If it was the default 1024, // the smallest stripe size would be 5120 rows, which changes the output @@ -167,7 +167,6 @@ public WriterOptions(Properties tableProperties, Configuration conf) { * @return this */ public WriterOptions setSchema(TypeDescription schema) { - this.explicitSchema = true; super.setSchema(schema); return this; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java index 1a17860c69..6ab630aed1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TreeReaderFactory.java @@ -803,8 +803,7 @@ void startStripe(Map streams, nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), streams.get(new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt); - base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone()); - this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / DateTimeConstants.MSECS_PER_SEC; + getBaseTimestamp(stripeFooter.getWriterTimezone()); } private long getBaseTimestamp(String timeZoneId) throws IOException { @@ -850,7 +849,7 @@ Datum next() throws IOException { if (valuePresent) { long millis = decodeTimestamp(data.next(), nanos.next(), base_timestamp); - long adjustedMillis = millis - timeZone.getRawOffset(); + long adjustedMillis = millis - writerTimeZone.getRawOffset(); return DatumFactory.createTimestamp(DateTimeUtil.javaTimeToJulianTime(adjustedMillis)); } else { return 
NullDatum.get();