-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Adds possibility to read '.gz' files when using local firehose #2394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,8 +21,10 @@ | |
|
|
||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonProperty; | ||
| import com.google.common.base.Charsets; | ||
| import com.google.common.base.Throwables; | ||
| import com.google.common.collect.Lists; | ||
| import com.metamx.common.CompressionUtils; | ||
| import com.metamx.common.IAE; | ||
| import com.metamx.common.ISE; | ||
| import com.metamx.emitter.EmittingLogger; | ||
|
|
@@ -31,12 +33,18 @@ | |
| import io.druid.data.input.impl.FileIteratingFirehose; | ||
| import io.druid.data.input.impl.StringInputRowParser; | ||
| import org.apache.commons.io.FileUtils; | ||
| import org.apache.commons.io.IOUtils; | ||
| import org.apache.commons.io.LineIterator; | ||
| import org.apache.commons.io.filefilter.TrueFileFilter; | ||
| import org.apache.commons.io.filefilter.WildcardFileFilter; | ||
|
|
||
| import java.io.BufferedReader; | ||
| import java.io.File; | ||
| import java.io.FileInputStream; | ||
| import java.io.FileNotFoundException; | ||
| import java.io.IOException; | ||
| import java.io.InputStream; | ||
| import java.io.InputStreamReader; | ||
| import java.util.Collection; | ||
| import java.util.Iterator; | ||
| import java.util.LinkedList; | ||
|
|
@@ -45,93 +53,122 @@ | |
| */ | ||
| public class LocalFirehoseFactory implements FirehoseFactory<StringInputRowParser> | ||
| { | ||
| private static final EmittingLogger log = new EmittingLogger(LocalFirehoseFactory.class); | ||
|
|
||
| private final File baseDir; | ||
| private final String filter; | ||
| private final StringInputRowParser parser; | ||
|
|
||
| @JsonCreator | ||
| public LocalFirehoseFactory( | ||
| @JsonProperty("baseDir") File baseDir, | ||
| @JsonProperty("filter") String filter, | ||
| // Backwards compatible | ||
| @JsonProperty("parser") StringInputRowParser parser | ||
| ) | ||
| { | ||
| this.baseDir = baseDir; | ||
| this.filter = filter; | ||
| this.parser = parser; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public File getBaseDir() | ||
| { | ||
| return baseDir; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public String getFilter() | ||
| { | ||
| return filter; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public StringInputRowParser getParser() | ||
| { | ||
| return parser; | ||
| } | ||
|
|
||
| @Override | ||
| public Firehose connect(StringInputRowParser firehoseParser) throws IOException | ||
| { | ||
| if (baseDir == null) { | ||
| throw new IAE("baseDir is null"); | ||
| } | ||
| log.info("Searching for all [%s] in and beneath [%s]", filter, baseDir.getAbsoluteFile()); | ||
|
|
||
| Collection<File> foundFiles = FileUtils.listFiles( | ||
| baseDir.getAbsoluteFile(), | ||
| new WildcardFileFilter(filter), | ||
| TrueFileFilter.INSTANCE | ||
| ); | ||
|
|
||
| if (foundFiles == null || foundFiles.isEmpty()) { | ||
| throw new ISE("Found no files to ingest! Check your schema."); | ||
| } | ||
| log.info ("Found files: " + foundFiles); | ||
|
|
||
| final LinkedList<File> files = Lists.newLinkedList( | ||
| foundFiles | ||
| ); | ||
|
|
||
| return new FileIteratingFirehose( | ||
| new Iterator<LineIterator>() | ||
| { | ||
| @Override | ||
| public boolean hasNext() | ||
| { | ||
| return !files.isEmpty(); | ||
| } | ||
|
|
||
| @Override | ||
| public LineIterator next() | ||
| { | ||
| try { | ||
| return FileUtils.lineIterator(files.poll()); | ||
| } | ||
| catch (Exception e) { | ||
| throw Throwables.propagate(e); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void remove() | ||
| { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
| }, | ||
| firehoseParser | ||
| ); | ||
| } | ||
| private static final EmittingLogger log = new EmittingLogger(LocalFirehoseFactory.class); | ||
|
|
||
| private final File baseDir; | ||
| private final String filter; | ||
| private final StringInputRowParser parser; | ||
|
|
||
| @JsonCreator | ||
| public LocalFirehoseFactory( | ||
| @JsonProperty("baseDir") File baseDir, | ||
| @JsonProperty("filter") String filter, | ||
| // Backwards compatible | ||
| @JsonProperty("parser") StringInputRowParser parser) | ||
| { | ||
| this.baseDir = baseDir; | ||
| this.filter = filter; | ||
| this.parser = parser; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public File getBaseDir() | ||
| { | ||
| return baseDir; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public String getFilter() | ||
| { | ||
| return filter; | ||
| } | ||
|
|
||
| @JsonProperty | ||
| public StringInputRowParser getParser() | ||
| { | ||
| return parser; | ||
| } | ||
|
|
||
| @Override | ||
| public Firehose connect(StringInputRowParser firehoseParser) throws IOException | ||
| { | ||
| if (baseDir == null) | ||
| { | ||
| throw new IAE("baseDir is null"); | ||
| } | ||
| log.info("Searching for all [%s] in and beneath [%s]", filter, baseDir.getAbsoluteFile()); | ||
|
|
||
| Collection<File> foundFiles = FileUtils.listFiles( | ||
| baseDir.getAbsoluteFile(), | ||
| new WildcardFileFilter(filter), | ||
| TrueFileFilter.INSTANCE); | ||
|
|
||
| if (foundFiles == null || foundFiles.isEmpty()) | ||
| { | ||
| throw new ISE("Found no files to ingest! Check your schema."); | ||
| } | ||
| log.info("Found files: " + foundFiles); | ||
|
|
||
| final LinkedList<File> files = Lists.newLinkedList( | ||
| foundFiles); | ||
|
|
||
| return new FileIteratingFirehose( | ||
| new Iterator<LineIterator>() | ||
| { | ||
| @Override | ||
| public boolean hasNext() | ||
| { | ||
| return !files.isEmpty(); | ||
| } | ||
|
|
||
| @Override | ||
| public LineIterator next() | ||
| { | ||
| final File f = files.poll(); | ||
| InputStream rawInputStream = null; | ||
| try | ||
| { | ||
| rawInputStream = new FileInputStream(f); | ||
| final InputStream inputStream; | ||
| String logMessage; | ||
| if (CompressionUtils.isGz(f.getName())) | ||
| { | ||
| logMessage = "Reading gzipped file [%s]"; | ||
| inputStream = CompressionUtils.gzipInputStream(rawInputStream); | ||
| } else | ||
| { | ||
| logMessage = "Reading file [%s]"; | ||
| inputStream = rawInputStream; | ||
| } | ||
|
|
||
| log.info(logMessage, f.getName()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we just put the log.info in both places above instead? |
||
|
|
||
| return IOUtils.lineIterator( | ||
| new BufferedReader( | ||
| new InputStreamReader(inputStream, Charsets.UTF_8))); | ||
| } catch (Exception e) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do any of the checked exceptions include InterruptedException? If so it should be handled in a way that does not loose the interrupted status |
||
| { | ||
| log.warn(e, "Failed to read file [%s]", f.getName()); | ||
| if (rawInputStream != null) | ||
| { | ||
| try | ||
| { | ||
| rawInputStream.close(); | ||
| } catch (IOException ioe) | ||
| { | ||
| Throwables.propagate(ioe); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest |
||
| } | ||
| } | ||
| throw Throwables.propagate(e); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void remove() | ||
| { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
| }, | ||
| firehoseParser); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it appears you changed spaces to tabs. druid uses 2 spaces for indentation. pls use appropriate formatter as mentioned in https://github.com/druid-io/druid/blob/master/CONTRIBUTING.md