From 64a2fc6ee1190776bcbb46ecf6841b58ce2bf311 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Wed, 25 Jan 2017 14:38:08 -0700 Subject: [PATCH 01/32] save some work and notes --- .../metron/dataloads/extractor/ExtractorHandler.java | 3 ++- .../apache/metron/dataloads/extractor/Extractors.java | 2 +- .../metron/dataloads/extractor/csv/CSVExtractor.java | 10 +++++----- .../dataloads/extractor/csv/CSVExtractorTest.java | 6 +++--- .../apache/metron/enrichment/lookup/LookupValue.java | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java index 89477d81b2..0c560ea3ae 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java @@ -26,7 +26,6 @@ import java.io.InputStream; import java.lang.reflect.InvocationTargetException; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.util.Map; public class ExtractorHandler { @@ -60,6 +59,8 @@ public Extractor getExtractor() { } public void setExtractor(String extractor) { try { + // TODO look in config for txformations/filter + // if true, decorate extractors.create this.extractor = Extractors.create(extractor); } catch (ClassNotFoundException | IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { throw new IllegalStateException("Unable to create an extractor", e); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java index 771a1e3fcd..5f63f4e1e4 
100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java @@ -21,7 +21,6 @@ import org.apache.metron.dataloads.extractor.stix.StixExtractor; import java.lang.reflect.InvocationTargetException; -import java.util.Map; public enum Extractors implements ExtractorCreator { CSV(new ExtractorCreator() { @@ -48,6 +47,7 @@ public Extractor create() { } public static Extractor create(String extractorName) throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { try { + //TODO create decorated extractor here - in init method setup Stellar ExtractorCreator ec = Extractors.valueOf(extractorName); return ec.create(); } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java index 502b46ad47..0cdccd1aee 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java @@ -32,14 +32,14 @@ public class CSVExtractor extends CSVConverter implements Extractor { public static final String TYPE_KEY="type"; public static final String LOOKUP_CONVERTER = "lookup_converter"; - private int typeColumn; + private int typeColumnIndex; private String type; private int indicatorColumn; private LookupConverter converter = LookupConverters.ENRICHMENT.getConverter(); - public int getTypeColumn() { - return typeColumn; + public int getTypeColumnIndex() { + return typeColumnIndex; } public String getType() { @@ -73,7 +73,7 @@ public Iterable extract(String line) throws IOException { private String 
getType(String[] tokens) { if(type == null) { - return tokens[typeColumn]; + return tokens[typeColumnIndex]; } else { return type; @@ -93,7 +93,7 @@ public void initialize(Map config) { type = config.get(TYPE_KEY).toString(); } else if(config.containsKey(TYPE_COLUMN_KEY)) { - typeColumn = columnMap.get(config.get(TYPE_COLUMN_KEY).toString()); + typeColumnIndex = columnMap.get(config.get(TYPE_COLUMN_KEY).toString()); } if(config.containsKey(LOOKUP_CONVERTER)) { converter = LookupConverters.getConverter((String) config.get(LOOKUP_CONVERTER)); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/csv/CSVExtractorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/csv/CSVExtractorTest.java index 4e482add7d..fee504fac3 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/csv/CSVExtractorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/csv/CSVExtractorTest.java @@ -36,11 +36,11 @@ public class CSVExtractorTest { "columns" : { "host" : 0 ,"meta" : 2 - } + } ,"indicator_column" : "host" ,"type" : "threat" ,"separator" : "," - } + } ,"extractor" : "CSV" } */ @@ -56,7 +56,7 @@ public void testInitialize() throws Exception { Assert.assertEquals(0, (int)ex.getColumnMap().get("host") ); Assert.assertEquals(2, (int)ex.getColumnMap().get("meta") ); - Assert.assertEquals(0, ex.getTypeColumn() ); + Assert.assertEquals(0, ex.getTypeColumnIndex() ); Assert.assertEquals(0, ex.getIndicatorColumn()); Assert.assertEquals("threat", ex.getType() ); Assert.assertEquals(',', ex.getParser().getSeparator()); diff --git a/metron-platform/metron-enrichment/src/main/java/org/apache/metron/enrichment/lookup/LookupValue.java b/metron-platform/metron-enrichment/src/main/java/org/apache/metron/enrichment/lookup/LookupValue.java index 24fbffd599..6cbad02884 100644 --- 
a/metron-platform/metron-enrichment/src/main/java/org/apache/metron/enrichment/lookup/LookupValue.java +++ b/metron-platform/metron-enrichment/src/main/java/org/apache/metron/enrichment/lookup/LookupValue.java @@ -20,9 +20,9 @@ package org.apache.metron.enrichment.lookup; import java.util.Map; -import java.util.NavigableMap; public interface LookupValue { Iterable> toColumns(); void fromColumns(Iterable> values); + Map getMetadata(); } From a6a6ab64e2777610ff57727195d3ce0d2c2c8cb1 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Fri, 27 Jan 2017 07:25:54 -0700 Subject: [PATCH 02/32] Extraction done --- .../stellar/network/NetworkFunctionsTest.java | 1 + .../extractor/ExtractorDecorator.java | 25 +++ .../dataloads/extractor/ExtractorHandler.java | 103 ++++++----- .../dataloads/extractor/Extractors.java | 2 +- .../TransformFilterExtractorDecorator.java | 134 ++++++++++++++ .../dataloads/extractor/csv/CSVExtractor.java | 6 +- .../SimpleEnrichmentFlatFileLoaderTest.java | 165 +++++++++++------- 7 files changed, 324 insertions(+), 112 deletions(-) create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/stellar/network/NetworkFunctionsTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/stellar/network/NetworkFunctionsTest.java index 783658c212..d43d6fded9 100644 --- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/stellar/network/NetworkFunctionsTest.java +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/stellar/network/NetworkFunctionsTest.java @@ -82,6 +82,7 @@ public void toTldTest_unknowntld() { @Test public void removeTldTest() { + runWithArguments("DOMAIN_REMOVE_TLD", "google.com", 
"google"); runWithArguments("DOMAIN_REMOVE_TLD", "www.google.co.uk", "www.google"); runWithArguments("DOMAIN_REMOVE_TLD", "www.google.com", "www.google"); runWithArguments("DOMAIN_REMOVE_TLD", "com", ""); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java new file mode 100644 index 0000000000..0ac5527d89 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java @@ -0,0 +1,25 @@ +package org.apache.metron.dataloads.extractor; + +import org.apache.metron.enrichment.lookup.LookupKV; + +import java.io.IOException; +import java.util.Map; + +public class ExtractorDecorator implements Extractor { + + protected final Extractor decoratedExtractor; + + public ExtractorDecorator(Extractor decoratedExtractor) { + this.decoratedExtractor = decoratedExtractor; + } + + @Override + public Iterable extract(String line) throws IOException { + return decoratedExtractor.extract(line); + } + + @Override + public void initialize(Map config) { + decoratedExtractor.initialize(config); + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java index 0c560ea3ae..0bdd44a177 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java @@ -29,53 +29,74 @@ import java.util.Map; public class ExtractorHandler { - final static ObjectMapper _mapper = new ObjectMapper(); - private Map config; - private Extractor extractor; - private InputFormatHandler 
inputFormatHandler = Formats.BY_LINE; + final static ObjectMapper _mapper = new ObjectMapper(); + private Map config; + private Extractor extractor; + private InputFormatHandler inputFormatHandler = Formats.BY_LINE; - public Map getConfig() { - return config; - } + public Map getConfig() { + return config; + } - public void setConfig(Map config) { - this.config = config; - } + /** + * Set by jackson. Extractor configuration from JSON + */ + public void setConfig(Map config) { + this.config = config; + } - public InputFormatHandler getInputFormatHandler() { - return inputFormatHandler; - } + public InputFormatHandler getInputFormatHandler() { + return inputFormatHandler; + } - public void setInputFormatHandler(String handler) { - try { - this.inputFormatHandler= Formats.create(handler); - } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) { - throw new IllegalStateException("Unable to create an inputformathandler", e); - } + /** + * Set by jackson + */ + public void setInputFormatHandler(String handler) { + try { + this.inputFormatHandler = Formats.create(handler); + } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) { + throw new IllegalStateException("Unable to create an inputformathandler", e); } + } - public Extractor getExtractor() { - return extractor; - } - public void setExtractor(String extractor) { - try { - // TODO look in config for txformations/filter - // if true, decorate extractors.create - this.extractor = Extractors.create(extractor); - } catch (ClassNotFoundException | IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { - throw new IllegalStateException("Unable to create an extractor", e); - } - } + public Extractor getExtractor() { + return extractor; + } - public static synchronized ExtractorHandler load(InputStream is) 
throws IOException { - ExtractorHandler ret = _mapper.readValue(is, ExtractorHandler.class); - ret.getExtractor().initialize(ret.getConfig()); - return ret; - } - public static synchronized ExtractorHandler load(String s, Charset c) throws IOException { - return load( new ByteArrayInputStream(s.getBytes(c))); - } - public static synchronized ExtractorHandler load(String s) throws IOException { - return load( s, Charset.defaultCharset()); + /** + * Set by jackson. + * + * @param extractor Name of extractor to instantiate + */ + public void setExtractor(String extractor) { + try { + this.extractor = Extractors.create(extractor); + } catch (ClassNotFoundException | IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { + throw new IllegalStateException("Unable to create an extractor", e); } + } + + /** + * Load json configuration + */ + public static synchronized ExtractorHandler load(InputStream is) throws IOException { + ExtractorHandler ret = _mapper.readValue(is, ExtractorHandler.class); + ret.getExtractor().initialize(ret.getConfig()); + return ret; + } + + /** + * Load json configuration + */ + public static synchronized ExtractorHandler load(String s, Charset c) throws IOException { + return load(new ByteArrayInputStream(s.getBytes(c))); + } + + /** + * Load json configuration + */ + public static synchronized ExtractorHandler load(String s) throws IOException { + return load(s, Charset.defaultCharset()); + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java index 5f63f4e1e4..37693c18d8 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java @@ -49,7 +49,7 @@ 
public static Extractor create(String extractorName) throws ClassNotFoundExcepti try { //TODO create decorated extractor here - in init method setup Stellar ExtractorCreator ec = Extractors.valueOf(extractorName); - return ec.create(); + return new TransformFilterExtractorDecorator(ec.create()); } catch(IllegalArgumentException iae) { Extractor ex = (Extractor) Class.forName(extractorName).getConstructor().newInstance(); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java new file mode 100644 index 0000000000..5b32c27ea2 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -0,0 +1,134 @@ +package org.apache.metron.dataloads.extractor; + +import com.fasterxml.jackson.core.type.TypeReference; +import org.apache.commons.lang.StringUtils; +import org.apache.curator.framework.CuratorFramework; +import org.apache.log4j.Logger; +import org.apache.metron.common.configuration.ConfigurationsUtils; +import org.apache.metron.common.dsl.Context; +import org.apache.metron.common.dsl.MapVariableResolver; +import org.apache.metron.common.dsl.StellarFunctions; +import org.apache.metron.common.stellar.StellarPredicateProcessor; +import org.apache.metron.common.stellar.StellarProcessor; +import org.apache.metron.common.utils.JSONUtils; +import org.apache.metron.enrichment.lookup.LookupKV; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.*; + +public class TransformFilterExtractorDecorator extends ExtractorDecorator { + private static final Logger LOG = Logger.getLogger(TransformFilterExtractorDecorator.class); + private static final String TRANSFORMATIONS = "transformations"; + private static final String FILTER = "filter"; + 
private static final String ZK_QUORUM = "zk_quorum"; + private Map transforms; + private String filterExpression; + private Context stellarContext; + private StellarProcessor transformProcessor; + private StellarPredicateProcessor filterProcessor; + private Map globalConfig; + + public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { + super(decoratedExtractor); + } + + @Override + public void initialize(Map config) { + super.initialize(config); + if (config.containsKey(TRANSFORMATIONS)) { + this.transforms = getTransforms(config.get(TRANSFORMATIONS)); + } else { + this.transforms = new HashMap<>(); + } + if (config.containsKey(FILTER)) { + this.filterExpression = config.get(FILTER).toString(); + } + String zkClientUrl = ""; + if (config.containsKey(ZK_QUORUM)) { + zkClientUrl = config.get(ZK_QUORUM).toString(); + } + Optional zkClient = createClient(zkClientUrl); + this.globalConfig = getGlobalConfig(zkClient); + this.stellarContext = createContext(zkClient); + StellarFunctions.initialize(stellarContext); + this.transformProcessor = new StellarProcessor(); + this.filterProcessor = new StellarPredicateProcessor(); + } + + private Map getTransforms(Object transformsConfig) { + Map transforms = new HashMap<>(); + if (transformsConfig instanceof Map) { + Map map = (Map) transformsConfig; + for (Map.Entry e : map.entrySet()) { + transforms.put(e.getKey().toString(), e.getValue().toString()); + } + } + return transforms; + } + + /** + * Creates a Zookeeper client. + * @param zookeeperUrl The Zookeeper URL. + */ + private Optional createClient(String zookeeperUrl) { + // can only create client, if have valid zookeeper URL + if (StringUtils.isNotBlank(zookeeperUrl)) { + CuratorFramework client = ConfigurationsUtils.getClient(zookeeperUrl); + client.start(); + return Optional.of(client); + } else { + LOG.warn("Unable to setup zookeeper client - zk_quorum url not provided. 
**This will limit some Stellar functionality**"); + return Optional.empty(); + } + } + + private Map getGlobalConfig(Optional zkClient) { + if (zkClient.isPresent()) { + try { + return JSONUtils.INSTANCE.load( + new ByteArrayInputStream(ConfigurationsUtils.readGlobalConfigBytesFromZookeeper(zkClient.get())), + new TypeReference>() { + }); + } catch (Exception e) { + LOG.warn("Exception thrown while attempting to get global config from Zookeeper.", e); + } + } + return new HashMap<>(); + } + + private Context createContext(Optional zkClient) { + Context.Builder builder = new Context.Builder(); + if (zkClient.isPresent()) { + builder.with(Context.Capabilities.ZOOKEEPER_CLIENT, () -> zkClient.get()) + .with(Context.Capabilities.GLOBAL_CONFIG, () -> globalConfig); + } + return builder.build(); + } + + @Override + public Iterable extract(String line) throws IOException { + List lkvs = new ArrayList<>(); + for (LookupKV lkv : super.extract(line)) { + if (updateLookupKV(lkv)) { + lkvs.add(lkv); + } + } + return lkvs; + } + + private boolean updateLookupKV(LookupKV lkv) { + Map ret = lkv.getValue().getMetadata(); + MapVariableResolver resolver = new MapVariableResolver(ret, globalConfig); + for (Map.Entry entry : transforms.entrySet()) { + Object o = transformProcessor.parse(entry.getValue(), resolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); + if (o == null) { + ret.remove(entry.getKey()); + } else { + ret.put(entry.getKey(), o); + } + } + return filterProcessor.parse(filterExpression, resolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); + } + +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java index 0cdccd1aee..005225e920 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java +++ 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/csv/CSVExtractor.java @@ -50,10 +50,10 @@ public int getIndicatorColumn() { return indicatorColumn; } - public LookupConverter getConverter() { return converter; } + @Override public Iterable extract(String line) throws IOException { if(ignore(line)) { @@ -69,8 +69,6 @@ public Iterable extract(String line) throws IOException { return Arrays.asList(new LookupKV(key, converter.toValue(values))); } - - private String getType(String[] tokens) { if(type == null) { return tokens[typeColumnIndex]; @@ -80,8 +78,6 @@ private String getType(String[] tokens) { } } - - @Override public void initialize(Map config) { super.initialize(config); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java index b4891aa9e0..e7c908a2f8 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java @@ -20,39 +20,27 @@ import org.adrianwalker.multilinestring.Multiline; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.PosixParser; -import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import 
org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.log4j.PropertyConfigurator; -import org.apache.metron.dataloads.bulk.ThreatIntelBulkLoader; import org.apache.metron.dataloads.extractor.Extractor; import org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; -import org.apache.metron.dataloads.nonbulk.flatfile.SimpleEnrichmentFlatFileLoader; import org.apache.metron.dataloads.hbase.mr.HBaseUtil; -import org.apache.metron.enrichment.converter.HbaseConverter; import org.apache.metron.enrichment.converter.EnrichmentConverter; import org.apache.metron.enrichment.converter.EnrichmentKey; import org.apache.metron.enrichment.converter.EnrichmentValue; import org.apache.metron.enrichment.lookup.LookupKV; -import org.apache.metron.common.utils.JSONUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -88,59 +76,106 @@ public class SimpleEnrichmentFlatFileLoaderTest { @Multiline private static String extractorConfig; - @Before - public void setup() throws Exception { - Map.Entry kv = HBaseUtil.INSTANCE.create(true); - config = kv.getValue(); - testUtil = kv.getKey(); - testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); - } - - @After - public void teardown() throws Exception { - HBaseUtil.INSTANCE.teardown(testUtil); - } - - @Test - public void testCommandLine() throws Exception { - Configuration conf = HBaseConfiguration.create(); - - String[] argv = {"-c cf", "-t enrichment", "-e extractor.json", "-n enrichment_config.json", "-l log4j", "-i input.csv"}; - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine 
cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson,SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); - Assert.assertEquals(tableName,SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); - Assert.assertEquals(enrichmentJson,SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - Assert.assertEquals(csvFile,SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); - Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); + @Before + public void setup() throws Exception { + Map.Entry kv = HBaseUtil.INSTANCE.create(true); + config = kv.getValue(); + testUtil = kv.getKey(); + testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + } + + @After + public void teardown() throws Exception { + HBaseUtil.INSTANCE.teardown(testUtil); + } + + @Test + public void testCommandLine() throws Exception { + Configuration conf = HBaseConfiguration.create(); + + String[] argv = {"-c cf", "-t enrichment", "-e extractor.json", "-n enrichment_config.json", "-l log4j", "-i input.csv"}; + String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); + + CommandLine cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); + Assert.assertEquals(extractorJson, SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); + Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); + Assert.assertEquals(tableName, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); + Assert.assertEquals(enrichmentJson, SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); + Assert.assertEquals(csvFile, 
SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); + Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); + } + + @Test + public void basicTest() throws Exception { + Assert.assertNotNull(testTable); + String contents = "google.com,1,foo"; + + EnrichmentConverter converter = new EnrichmentConverter(); + ExtractorHandler handler = ExtractorHandler.load(extractorConfig); + Extractor e = handler.getExtractor(); + SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); + testTable.put(loader.extract(contents, e, cf, converter)); + + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); } - - @Test - public void test() throws Exception { - - Assert.assertNotNull(testTable); - String contents = "google.com,1,foo"; - - EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(extractorConfig); - Extractor e = handler.getExtractor(); - File file = new File (contents); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - testTable.put(loader.extract(contents, e, cf, converter)); - - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for(Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(1, results.size()); - Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); - Assert.assertEquals(results.get(0).getKey().type, "enrichment"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); + Assert.assertEquals(1, results.size()); + Assert.assertEquals(results.get(0).getKey().indicator, 
"google.com"); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); + Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); + } + + /** + { + "config" : { + "columns" : { + "domain" : 1 + }, + "transformations" : { + "domain" : "TO_UPPER(domain)" + }, + "filter" : "LENGTH(domain) > 0", + "indicator_column" : "domain", + "type" : "topdomain", + "separator" : "," + }, + "extractor" : "CSV" + } + */ + @Multiline + private static String stellarExtractorConfig; + + @Test + public void transforms_fields() throws Exception { + Assert.assertNotNull(testTable); + String[] contents = new String[]{ + "1,google.com", + "2," + }; + + EnrichmentConverter converter = new EnrichmentConverter(); + ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfig); + Extractor e = handler.getExtractor(); + SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); + List extract = loader.extract(contents[0], e, cf, converter); + testTable.put(extract); + extract = loader.extract(contents[1], e, cf, converter); + testTable.put(extract); + + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); } + Assert.assertEquals(1, results.size()); + Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); + Assert.assertEquals(results.get(0).getKey().type, "topdomain"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 1); + Assert.assertEquals(results.get(0).getValue().getMetadata().get("domain"), "GOOGLE.COM"); + } } From 47d814ef95d67738d20ce5dc530ba7b05d418a96 Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 18:15:44 -0500 Subject: [PATCH 03/32] Multithreading the SimpleEnrichmentFlatFileLoader --- 
.../common/utils/file/ReaderSpliterator.java | 218 ++++++++++++++++++ .../nonbulk/flatfile/ExtractorState.java | 46 ++++ .../SimpleEnrichmentFlatFileLoader.java | 117 ++++++++-- 3 files changed, 358 insertions(+), 23 deletions(-) create mode 100644 metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java new file mode 100644 index 0000000000..d10146ba65 --- /dev/null +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.metron.common.utils.file; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Spliterator; +import java.util.function.Consumer; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import static java.util.Spliterators.spliterator; + +public class ReaderSpliterator implements Spliterator { + private static int characteristics = NONNULL | ORDERED | IMMUTABLE; + private int batchSize ; + private BufferedReader reader; + public ReaderSpliterator(BufferedReader reader) { + this(reader, 128); + } + + public ReaderSpliterator(BufferedReader reader, int batchSize) { + this.batchSize = batchSize; + this.reader = reader; + } + + @Override + public void forEachRemaining(Consumer action) { + if (action == null) { + throw new NullPointerException(); + } + try { + for (String line = null; (line = reader.readLine()) != null;) { + action.accept(line); + } + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new IllegalStateException(e); + } + } + /** + * If a remaining element exists, performs the given action on it, + * returning {@code true}; else returns {@code false}. If this + * Spliterator is {@link #ORDERED} the action is performed on the + * next element in encounter order. Exceptions thrown by the + * action are relayed to the caller. + * + * @param action The action + * @return {@code false} if no remaining elements existed + * upon entry to this method, else {@code true}. 
+   * @throws NullPointerException if the specified action is null
+   */
+  @Override
+  public boolean tryAdvance(Consumer<? super String> action) {
+    if (action == null) {
+      throw new NullPointerException();
+    }
+    try {
+      final String line = reader.readLine();
+      if (line == null) {
+        return false;
+      }
+      action.accept(line);
+      return true;
+    } catch (RuntimeException e) {
+      throw e;
+    } catch (Exception e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * If this spliterator can be partitioned, returns a Spliterator
+   * covering elements, that will, upon return from this method, not
+   * be covered by this Spliterator.
+   *

+   * <p>If this Spliterator is {@link #ORDERED}, the returned Spliterator
+   * must cover a strict prefix of the elements.
+   *
+   * <p>Unless this Spliterator covers an infinite number of elements,
+   * repeated calls to {@code trySplit()} must eventually return {@code null}.
+   * Upon non-null return:
+   * <ul>
+   * <li>the value reported for {@code estimateSize()} before splitting,
+   * must, after splitting, be greater than or equal to {@code estimateSize()}
+   * for this and the returned Spliterator; and</li>
+   * <li>if this Spliterator is {@code SUBSIZED}, then {@code estimateSize()}
+   * for this spliterator before splitting must be equal to the sum of
+   * {@code estimateSize()} for this and the returned Spliterator after
+   * splitting.</li>
+   * </ul>
+   *
+   * <p>
This method may return {@code null} for any reason, + * including emptiness, inability to split after traversal has + * commenced, data structure constraints, and efficiency + * considerations. + * + * @return a {@code Spliterator} covering some portion of the + * elements, or {@code null} if this spliterator cannot be split + * @apiNote An ideal {@code trySplit} method efficiently (without + * traversal) divides its elements exactly in half, allowing + * balanced parallel computation. Many departures from this ideal + * remain highly effective; for example, only approximately + * splitting an approximately balanced tree, or for a tree in + * which leaf nodes may contain either one or two elements, + * failing to further split these nodes. However, large + * deviations in balance and/or overly inefficient {@code + * trySplit} mechanics typically result in poor parallel + * performance. + */ + @Override + public Spliterator trySplit() { + final HoldingConsumer holder = new HoldingConsumer<>(); + if (!tryAdvance(holder)) { + return null; + } + final String[] batch = new String[batchSize]; + int j = 0; + do { + batch[j] = holder.value; + } + while (++j < batchSize && tryAdvance(holder)); + return spliterator(batch, 0, j, characteristics() | SIZED); + } + + /** + * Returns an estimate of the number of elements that would be + * encountered by a {@link #forEachRemaining} traversal, or returns {@link + * Long#MAX_VALUE} if infinite, unknown, or too expensive to compute. + *

+   * <p>
If this Spliterator is {@link #SIZED} and has not yet been partially + * traversed or split, or this Spliterator is {@link #SUBSIZED} and has + * not yet been partially traversed, this estimate must be an accurate + * count of elements that would be encountered by a complete traversal. + * Otherwise, this estimate may be arbitrarily inaccurate, but must decrease + * as specified across invocations of {@link #trySplit}. + * + * @return the estimated size, or {@code Long.MAX_VALUE} if infinite, + * unknown, or too expensive to compute. + * @apiNote Even an inexact estimate is often useful and inexpensive to compute. + * For example, a sub-spliterator of an approximately balanced binary tree + * may return a value that estimates the number of elements to be half of + * that of its parent; if the root Spliterator does not maintain an + * accurate count, it could estimate size to be the power of two + * corresponding to its maximum depth. + */ + @Override + public long estimateSize() { + return Long.MAX_VALUE; + } + + /** + * Returns a set of characteristics of this Spliterator and its + * elements. The result is represented as ORed values from {@link + * #ORDERED}, {@link #DISTINCT}, {@link #SORTED}, {@link #SIZED}, + * {@link #NONNULL}, {@link #IMMUTABLE}, {@link #CONCURRENT}, + * {@link #SUBSIZED}. Repeated calls to {@code characteristics()} on + * a given spliterator, prior to or in-between calls to {@code trySplit}, + * should always return the same result. + *

+   * <p>
If a Spliterator reports an inconsistent set of + * characteristics (either those returned from a single invocation + * or across multiple invocations), no guarantees can be made + * about any computation using this Spliterator. + * + * @return a representation of characteristics + * @apiNote The characteristics of a given spliterator before splitting + * may differ from the characteristics after splitting. For specific + * examples see the characteristic values {@link #SIZED}, {@link #SUBSIZED} + * and {@link #CONCURRENT}. + */ + @Override + public int characteristics() { + return characteristics; + } + + static class HoldingConsumer implements Consumer { + String value; + /** + * Performs this operation on the given argument. + * + * @param string the input argument + */ + @Override + public void accept(String string) { + this.value = string; + } + } + + public static Stream lineStream(BufferedReader in, int batchSize) { + return StreamSupport.stream(new ReaderSpliterator(in, batchSize), false) + .onClose(() -> { + try { + in.close(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + ); + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java new file mode 100644 index 0000000000..e44eb27175 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.metron.dataloads.extractor.Extractor; +import org.apache.metron.enrichment.converter.HbaseConverter; + +public class ExtractorState { + private HTableInterface table; + private Extractor extractor; + private HbaseConverter converter; + + public ExtractorState(HTableInterface table, Extractor extractor, HbaseConverter converter) { + this.table = table; + this.extractor = extractor; + this.converter = converter; + } + + public HTableInterface getTable() { + return table; + } + + public Extractor getExtractor() { + return extractor; + } + + public HbaseConverter getConverter() { + return converter; + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index 0c7501a9d7..c05d6fd89a 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -20,6 +20,7 @@ import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import org.apache.commons.cli.*; import 
org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; @@ -28,6 +29,8 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.log4j.PropertyConfigurator; +import org.apache.metron.common.utils.ConversionUtils; +import org.apache.metron.common.utils.file.ReaderSpliterator; import org.apache.metron.dataloads.extractor.Extractor; import org.apache.metron.dataloads.extractor.ExtractorHandler; import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; @@ -39,13 +42,13 @@ import org.apache.metron.common.utils.JSONUtils; import javax.annotation.Nullable; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.Stack; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.stream.Stream; public class SimpleEnrichmentFlatFileLoader { private static abstract class OptionHandler implements Function {} @@ -111,6 +114,26 @@ public Option apply(@Nullable String s) { return o; } }) + ,NUM_THREADS("p", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "threads", true, "The batch size to use for hbase puts"); + o.setArgName("NUM_THREADS"); + o.setRequired(false); + return o; + } + }) + ,BATCH_SIZE("b", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "batchSize", true, "The batch size to use for hbase puts"); + o.setArgName("SIZE"); + o.setRequired(false); + return o; + } + }) ,INPUT("i", new OptionHandler() { @Nullable @Override @@ -207,25 +230,56 @@ public List extract( String line return ret; } - - public void loadFile( File inputFile - , Extractor extractor - , HTableInterface table - , String cf - , HbaseConverter converter - , boolean lineByLine - ) throws 
IOException + public void load( final Iterable> streams + , final ThreadLocal state + , final String cf + , int numThreads + ) { + System.out.println("Number of threads: " + numThreads); + for(Stream stream : streams) { + try { + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + forkJoinPool.submit(() -> + stream.parallel().forEach(input -> { + ExtractorState es = state.get(); + try { + es.getTable().put(extract(input, es.getExtractor(), cf, es.getConverter())); + } catch (IOException e) { + throw new IllegalStateException("Unable to continue: " + e.getMessage(), e); + } + } + ) + ).get(); + } catch (InterruptedException e) { + throw new IllegalStateException(e.getMessage(), e); + } catch (ExecutionException e) { + throw new IllegalStateException(e.getMessage(), e); + } finally { + stream.close(); + } + } + } + + private static Iterable> streamify(List files, int batchSize, boolean lineByLine) throws FileNotFoundException { + List> ret = new ArrayList<>(); if(!lineByLine) { - table.put(extract(FileUtils.readFileToString(inputFile), extractor, cf, converter)); + ret.add(files.stream().map(f -> { + try { + return FileUtils.readFileToString(f); + } catch (IOException e) { + throw new IllegalStateException("File " + f.getName() + " not found."); + } + })); } else { - BufferedReader br = new BufferedReader(new FileReader(inputFile)); - for(String line = null;(line = br.readLine()) != null;) { - table.put(extract(line, extractor, cf, converter)); + for(File f : files) { + ret.add(ReaderSpliterator.lineStream(new BufferedReader(new FileReader(f)), batchSize)); } } + return ret; } + public static void main(String... argv) throws Exception { Configuration conf = HBaseConfiguration.create(); String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); @@ -237,23 +291,40 @@ public static void main(String... 
argv) throws Exception { ExtractorHandler handler = ExtractorHandler.load( FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli))) ); + int batchSize = 128; + if(LoadOptions.BATCH_SIZE.has(cli)) { + batchSize = ConversionUtils.convert(LoadOptions.BATCH_SIZE.get(cli), Integer.class); + } + int numThreads = Runtime.getRuntime().availableProcessors(); + if(LoadOptions.NUM_THREADS.has(cli)) { + numThreads = ConversionUtils.convert(LoadOptions.NUM_THREADS.get(cli), Integer.class); + } boolean lineByLine = !handler.getInputFormatHandler().getClass().equals(WholeFileFormat.class); - Extractor e = handler.getExtractor(); SensorEnrichmentUpdateConfig sensorEnrichmentUpdateConfig = null; if(LoadOptions.ENRICHMENT_CONFIG.has(cli)) { sensorEnrichmentUpdateConfig = JSONUtils.INSTANCE.load( new File(LoadOptions.ENRICHMENT_CONFIG.get(cli)) , SensorEnrichmentUpdateConfig.class ); } - HbaseConverter converter = new EnrichmentConverter(); List inputFiles = getFiles(new File(LoadOptions.INPUT.get(cli))); SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - HTableInterface table = loader.getProvider() - .getTable(conf, LoadOptions.HBASE_TABLE.get(cli)); + ThreadLocal state = new ThreadLocal() { + @Override + protected ExtractorState initialValue() { + try { + ExtractorHandler handler = ExtractorHandler.load( + FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli))) + ); + HTableInterface table = loader.getProvider().getTable(conf, LoadOptions.HBASE_TABLE.get(cli)); + return new ExtractorState(table, handler.getExtractor(), new EnrichmentConverter()); + } catch (IOException e1) { + throw new IllegalStateException("Unable to get table: " + e1); + } + } + }; + + loader.load(streamify(inputFiles, batchSize, lineByLine), state, LoadOptions.HBASE_CF.get(cli), numThreads); - for (File f : inputFiles) { - loader.loadFile(f, e, table, LoadOptions.HBASE_CF.get(cli), converter, lineByLine); - } if(sensorEnrichmentUpdateConfig != 
null) { sensorEnrichmentUpdateConfig.updateSensorConfigs(); } From 918d4ce4aea5d7dfde992f32bf049c70f35dd182 Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 18:23:19 -0500 Subject: [PATCH 04/32] doc changes. --- .../metron-data-management/README.md | 21 ++++++++++--------- .../SimpleEnrichmentFlatFileLoader.java | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/metron-platform/metron-data-management/README.md b/metron-platform/metron-data-management/README.md index a0c0164686..7a9836cd05 100644 --- a/metron-platform/metron-data-management/README.md +++ b/metron-platform/metron-data-management/README.md @@ -240,16 +240,17 @@ each document to be considered as input to the Extractor. The parameters for the utility are as follows: -| Short Code | Long Code | Is Required? | Description | -|------------|---------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| -h | | No | Generate the help screen/set of options | -| -e | --extractor_config | Yes | JSON Document describing the extractor for this input data source | -| -t | --hbase_table | Yes | The HBase table to import into | -| -c | --hbase_cf | Yes | The HBase table column family to import into | -| -i | --input | Yes | The input data location on local disk. If this is a file, then that file will be loaded. If this is a directory, then the files will be loaded recursively under that directory. | -| -l | --log4j | No | The log4j properties file to load | -| -n | --enrichment_config | No | The JSON document describing the enrichments to configure. Unlike other loaders, this is run first if specified. | - +| Short Code | Long Code | Is Required? 
| Description | | +|------------|---------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---| +| -h | | No | Generate the help screen/set of options | | +| -e | --extractor_config | Yes | JSON Document describing the extractor for this input data source | | +| -t | --hbase_table | Yes | The HBase table to import into | | +| -c | --hbase_cf | Yes | The HBase table column family to import into | | +| -i | --input | Yes | The input data location on local disk. If this is a file, then that file will be loaded. If this is a directory, then the files will be loaded recursively under that directory. | | +| -l | --log4j | No | The log4j properties file to load | | +| -n | --enrichment_config | No | The JSON document describing the enrichments to configure. Unlike other loaders, this is run first if specified. | | +| -p | --threads | No | The number of threads to use when extracting data | | +| -b | --batchSize | No | The batch size to use for HBase puts | | ### GeoLite2 Loader The shell script `$METRON_HOME/bin/geo_enrichment_load.sh` will retrieve MaxMind GeoLite2 data and load data into HDFS, and update the configuration. 
diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index c05d6fd89a..a84b5d7b4c 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -118,7 +118,7 @@ public Option apply(@Nullable String s) { @Nullable @Override public Option apply(@Nullable String s) { - Option o = new Option(s, "threads", true, "The batch size to use for hbase puts"); + Option o = new Option(s, "threads", true, "The number of threads to use when extracting data"); o.setArgName("NUM_THREADS"); o.setRequired(false); return o; @@ -128,7 +128,7 @@ public Option apply(@Nullable String s) { @Nullable @Override public Option apply(@Nullable String s) { - Option o = new Option(s, "batchSize", true, "The batch size to use for hbase puts"); + Option o = new Option(s, "batchSize", true, "The batch size to use for HBase puts"); o.setArgName("SIZE"); o.setRequired(false); return o; From c6ca3a86881eb77bc9598a61e3c0cf8280ccb03f Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 18:39:56 -0500 Subject: [PATCH 05/32] Updating docs. 
--- metron-platform/metron-data-management/README.md | 4 ++-- .../nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metron-platform/metron-data-management/README.md b/metron-platform/metron-data-management/README.md index 7a9836cd05..26dd4721f2 100644 --- a/metron-platform/metron-data-management/README.md +++ b/metron-platform/metron-data-management/README.md @@ -249,8 +249,8 @@ The parameters for the utility are as follows: | -i | --input | Yes | The input data location on local disk. If this is a file, then that file will be loaded. If this is a directory, then the files will be loaded recursively under that directory. | | | -l | --log4j | No | The log4j properties file to load | | | -n | --enrichment_config | No | The JSON document describing the enrichments to configure. Unlike other loaders, this is run first if specified. | | -| -p | --threads | No | The number of threads to use when extracting data | | -| -b | --batchSize | No | The batch size to use for HBase puts | | +| -p | --threads | No | The number of threads to use when extracting data. The default is the number of cores. | | +| -b | --batchSize | No | The batch size to use for HBase puts | | ### GeoLite2 Loader The shell script `$METRON_HOME/bin/geo_enrichment_load.sh` will retrieve MaxMind GeoLite2 data and load data into HDFS, and update the configuration. 
diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index a84b5d7b4c..24c16cc82d 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -118,7 +118,7 @@ public Option apply(@Nullable String s) { @Nullable @Override public Option apply(@Nullable String s) { - Option o = new Option(s, "threads", true, "The number of threads to use when extracting data"); + Option o = new Option(s, "threads", true, "The number of threads to use when extracting data. The default is the number of cores of your machine."); o.setArgName("NUM_THREADS"); o.setRequired(false); return o; From 8c9a79cdfa38ea2fbd161095d5e346147558ec5f Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 22:36:31 -0500 Subject: [PATCH 06/32] Investigating integration tests. 
--- .../integration/ElasticsearchIndexingIntegrationTest.java | 1 + .../metron/indexing/integration/IndexingIntegrationTest.java | 2 ++ 2 files changed, 3 insertions(+) diff --git a/metron-platform/metron-elasticsearch/src/test/java/org/apache/metron/elasticsearch/integration/ElasticsearchIndexingIntegrationTest.java b/metron-platform/metron-elasticsearch/src/test/java/org/apache/metron/elasticsearch/integration/ElasticsearchIndexingIntegrationTest.java index 7e9f23129d..acc1565e5c 100644 --- a/metron-platform/metron-elasticsearch/src/test/java/org/apache/metron/elasticsearch/integration/ElasticsearchIndexingIntegrationTest.java +++ b/metron-platform/metron-elasticsearch/src/test/java/org/apache/metron/elasticsearch/integration/ElasticsearchIndexingIntegrationTest.java @@ -85,6 +85,7 @@ public ReadinessState process(ComponentRunner runner) { return ReadinessState.READY; } } else { + System.out.println("Missed index..."); return ReadinessState.NOT_READY; } } diff --git a/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java b/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java index 03ae9ffd8f..a93c442ab0 100644 --- a/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java +++ b/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java @@ -205,6 +205,7 @@ public void test() throws Exception { private void waitForIndex(String zookeeperQuorum) throws Exception { try(CuratorFramework client = getClient(zookeeperQuorum)) { client.start(); + System.out.println("Waiting for zookeeper..."); byte[] bytes = null; do { try { @@ -216,6 +217,7 @@ private void waitForIndex(String zookeeperQuorum) throws Exception { } } while(bytes == null || bytes.length == 0); + System.out.println("Found index config in zookeeper..."); } } From 
315bd181aa634290ab987441d81c28addb7952e2 Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 23:09:28 -0500 Subject: [PATCH 07/32] Update integration test to be a proper integration test. --- .../SimpleEnrichmentFlatFileLoader.java | 1 - .../SimpleEnrichmentFlatFileLoaderTest.java | 180 ++++++++++-------- 2 files changed, 99 insertions(+), 82 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index 24c16cc82d..9992422db3 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -236,7 +236,6 @@ public void load( final Iterable> streams , int numThreads ) { - System.out.println("Number of threads: " + numThreads); for(Stream stream : streams) { try { ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java index b4891aa9e0..4ffb91a537 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java @@ -17,6 +17,7 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile; +import com.google.common.collect.ImmutableList; import org.adrianwalker.multilinestring.Multiline; import 
org.apache.commons.cli.CommandLine; import org.apache.commons.cli.PosixParser; @@ -56,91 +57,108 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.stream.Stream; public class SimpleEnrichmentFlatFileLoaderTest { - private HBaseTestingUtility testUtil; - - /** The test table. */ - private HTable testTable; - private String tableName = "enrichment"; - private String cf = "cf"; - private String csvFile="input.csv"; - private String extractorJson = "extractor.json"; - private String enrichmentJson = "enrichment_config.json"; - private String log4jProperty = "log4j"; - - Configuration config = null; - /** - { - "config" : { - "columns" : { - "host" : 0, - "meta" : 2 - }, - "indicator_column" : "host", - "separator" : ",", - "type" : "enrichment" - }, - "extractor" : "CSV" - } - */ - @Multiline - private static String extractorConfig; - - @Before - public void setup() throws Exception { - Map.Entry kv = HBaseUtil.INSTANCE.create(true); - config = kv.getValue(); - testUtil = kv.getKey(); - testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + private HBaseTestingUtility testUtil; + + /** The test table. 
*/ + private HTable testTable; + private String tableName = "enrichment"; + private String cf = "cf"; + private String csvFile="input.csv"; + private String extractorJson = "extractor.json"; + private String enrichmentJson = "enrichment_config.json"; + private String log4jProperty = "log4j"; + + Configuration config = null; + /** + { + "config" : { + "columns" : { + "host" : 0, + "meta" : 2 + }, + "indicator_column" : "host", + "separator" : ",", + "type" : "enrichment" + }, + "extractor" : "CSV" } - - @After - public void teardown() throws Exception { - HBaseUtil.INSTANCE.teardown(testUtil); - } - - @Test - public void testCommandLine() throws Exception { - Configuration conf = HBaseConfiguration.create(); - - String[] argv = {"-c cf", "-t enrichment", "-e extractor.json", "-n enrichment_config.json", "-l log4j", "-i input.csv"}; - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson,SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); - Assert.assertEquals(tableName,SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); - Assert.assertEquals(enrichmentJson,SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - Assert.assertEquals(csvFile,SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); - Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); - } - - @Test - public void test() throws Exception { - - Assert.assertNotNull(testTable); - String contents = "google.com,1,foo"; - - EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(extractorConfig); - Extractor e = handler.getExtractor(); - File file = new File 
(contents); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - testTable.put(loader.extract(contents, e, cf, converter)); - - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for(Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(1, results.size()); - Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); - Assert.assertEquals(results.get(0).getKey().type, "enrichment"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); + */ + @Multiline + private static String extractorConfig; + + @Before + public void setup() throws Exception { + Map.Entry kv = HBaseUtil.INSTANCE.create(true); + config = kv.getValue(); + testUtil = kv.getKey(); + testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + } + + @After + public void teardown() throws Exception { + HBaseUtil.INSTANCE.teardown(testUtil); + } + + @Test + public void testCommandLine() throws Exception { + Configuration conf = HBaseConfiguration.create(); + + String[] argv = { "-c cf", "-t enrichment" + , "-e extractor.json", "-n enrichment_config.json" + , "-l log4j", "-i input.csv" + , "-p 2", "-b 128" + }; + String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); + + CommandLine cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); + Assert.assertEquals(extractorJson,SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); + Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); + Assert.assertEquals(tableName,SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); + 
Assert.assertEquals(enrichmentJson,SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); + Assert.assertEquals(csvFile,SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); + Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); + Assert.assertEquals("2", SimpleEnrichmentFlatFileLoader.LoadOptions.NUM_THREADS.get(cli).trim()); + Assert.assertEquals("128", SimpleEnrichmentFlatFileLoader.LoadOptions.BATCH_SIZE.get(cli).trim()); + } + + @Test + public void test() throws Exception { + + Assert.assertNotNull(testTable); + String contents = "google.com,1,foo"; + + EnrichmentConverter converter = new EnrichmentConverter(); + ExtractorHandler handler = ExtractorHandler.load(extractorConfig); + Extractor e = handler.getExtractor(); + SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); + Stream contentStreams = ImmutableList.of(contents).stream(); + ThreadLocal state = new ThreadLocal() { + @Override + protected ExtractorState initialValue() { + return new ExtractorState(testTable, e, converter); + } + }; + loader.load(ImmutableList.of(contentStreams) + , state + , cf + , 2 + ); + + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for(Result r : scanner) { + results.add(converter.fromResult(r, cf)); } + Assert.assertEquals(1, results.size()); + Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); + Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); + } } From 004c6f41b6c1cc3ecea70513e1a468501bd32e3c Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 27 Jan 2017 23:49:37 -0500 Subject: [PATCH 08/32] Adding spliterator unit test 
for completeness --- .../utils/file/ReaderSpliteratorTest.java | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java new file mode 100644 index 0000000000..b1ddede4eb --- /dev/null +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.metron.common.utils.file; + +import org.adrianwalker.multilinestring.Multiline; +import org.junit.Assert; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.StringReader; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class ReaderSpliteratorTest { + /** + foo + bar + grok + foo + the + and + grok + foo + bar + */ + @Multiline + public static String data; + + @Test + public void testParallelStreamSmallBatch() { + Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); + Map count = + stream.parallel().map( s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int)count.get("foo")); + Assert.assertEquals(2, (int)count.get("bar")); + Assert.assertEquals(1, (int)count.get("and")); + Assert.assertEquals(1, (int)count.get("the")); + } + + @Test + public void testParallelStreamLargeBatch() { + Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 100); + Map count = + stream.parallel().map( s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int)count.get("foo")); + Assert.assertEquals(2, (int)count.get("bar")); + Assert.assertEquals(1, (int)count.get("and")); + Assert.assertEquals(1, (int)count.get("the")); + } + + @Test + public void testSequentialStreamLargeBatch() { + Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 100); + Map count = + stream.map( s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int)count.get("foo")); + Assert.assertEquals(2, (int)count.get("bar")); + Assert.assertEquals(1, 
(int)count.get("and")); + Assert.assertEquals(1, (int)count.get("the")); + } + + @Test + public void testActuallyParallel() throws ExecutionException, InterruptedException { + //With 9 elements and a batch of 2, we should only ceil(9/2) = 5 batches, so at most min(5, 2) = 2 threads will be used + Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); + ForkJoinPool forkJoinPool = new ForkJoinPool(2); + forkJoinPool.submit(() -> { + Map threads= + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() <= 2); + } + ).get(); + } + + @Test + public void testActuallyParallel_mediumBatch() throws ExecutionException, InterruptedException { + //With 9 elements and a batch of 2, we should only ceil(9/2) = 5 batches, so at most 5 threads of the pool of 10 will be used + Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); + ForkJoinPool forkJoinPool = new ForkJoinPool(10); + forkJoinPool.submit(() -> { + Map threads= + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() <= (int)Math.ceil(9.0/2) && threads.size() > 1); + } + ).get(); + } + + @Test + public void testActuallyParallel_bigBatch() throws ExecutionException, InterruptedException { + //With 9 elements and a batch of 10, we should only have one batch, so only one thread will be used + //despite the thread pool size of 2. 
+ Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 10); + ForkJoinPool forkJoinPool = new ForkJoinPool(2); + forkJoinPool.submit(() -> { + Map threads= + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(1, threads.size()); + } + ).get(); + } +} From f8dd48ef920c948e1fc5ff736e386f641e551b2b Mon Sep 17 00:00:00 2001 From: cstella Date: Sat, 28 Jan 2017 00:01:42 -0500 Subject: [PATCH 09/32] Updating test to use a proper file --- .../utils/file/ReaderSpliteratorTest.java | 150 ++++++++++-------- 1 file changed, 88 insertions(+), 62 deletions(-) diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java index b1ddede4eb..cf259d8fb2 100644 --- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java @@ -19,10 +19,14 @@ import org.adrianwalker.multilinestring.Multiline; import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; -import java.io.BufferedReader; -import java.io.StringReader; +import java.io.*; +import java.nio.file.Files; +import java.nio.file.OpenOption; +import java.nio.file.StandardOpenOption; import java.util.Map; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; @@ -43,86 +47,108 @@ public class ReaderSpliteratorTest { */ @Multiline public static String data; + public static final File dataFile = new File("target/readerspliteratortest.data"); + + @BeforeClass + public static void setup() throws IOException { + if(dataFile.exists()) { + dataFile.delete(); + } + Files.write(dataFile.toPath(), data.getBytes(), 
StandardOpenOption.CREATE_NEW, StandardOpenOption.TRUNCATE_EXISTING); + dataFile.deleteOnExit(); + } + + public static BufferedReader getReader() throws FileNotFoundException { + return new BufferedReader(new FileReader(dataFile)); + } @Test - public void testParallelStreamSmallBatch() { - Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); - Map count = - stream.parallel().map( s -> s.trim()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertEquals(5, count.size()); - Assert.assertEquals(3, (int)count.get("foo")); - Assert.assertEquals(2, (int)count.get("bar")); - Assert.assertEquals(1, (int)count.get("and")); - Assert.assertEquals(1, (int)count.get("the")); + public void testParallelStreamSmallBatch() throws FileNotFoundException { + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 2)) { + + Map count = + stream.parallel().map( s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int)count.get("foo")); + Assert.assertEquals(2, (int)count.get("bar")); + Assert.assertEquals(1, (int)count.get("and")); + Assert.assertEquals(1, (int)count.get("the")); + } } @Test - public void testParallelStreamLargeBatch() { - Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 100); - Map count = - stream.parallel().map( s -> s.trim()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertEquals(5, count.size()); - Assert.assertEquals(3, (int)count.get("foo")); - Assert.assertEquals(2, (int)count.get("bar")); - Assert.assertEquals(1, (int)count.get("and")); - Assert.assertEquals(1, (int)count.get("the")); + public void testParallelStreamLargeBatch() throws FileNotFoundException { + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 100)) { + Map count = + stream.parallel().map(s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, 
Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int) count.get("foo")); + Assert.assertEquals(2, (int) count.get("bar")); + Assert.assertEquals(1, (int) count.get("and")); + Assert.assertEquals(1, (int) count.get("the")); + } } @Test - public void testSequentialStreamLargeBatch() { - Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 100); - Map count = - stream.map( s -> s.trim()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertEquals(5, count.size()); - Assert.assertEquals(3, (int)count.get("foo")); - Assert.assertEquals(2, (int)count.get("bar")); - Assert.assertEquals(1, (int)count.get("and")); - Assert.assertEquals(1, (int)count.get("the")); + public void testSequentialStreamLargeBatch() throws FileNotFoundException { + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 100)) { + Map count = + stream.map(s -> s.trim()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(5, count.size()); + Assert.assertEquals(3, (int) count.get("foo")); + Assert.assertEquals(2, (int) count.get("bar")); + Assert.assertEquals(1, (int) count.get("and")); + Assert.assertEquals(1, (int) count.get("the")); + } } @Test - public void testActuallyParallel() throws ExecutionException, InterruptedException { + public void testActuallyParallel() throws ExecutionException, InterruptedException, FileNotFoundException { //With 9 elements and a batch of 2, we should only ceil(9/2) = 5 batches, so at most min(5, 2) = 2 threads will be used - Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); - ForkJoinPool forkJoinPool = new ForkJoinPool(2); - forkJoinPool.submit(() -> { - Map threads= - stream.parallel().map(s -> Thread.currentThread().getName()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertTrue(threads.size() <= 2); - } - ).get(); + try( Stream stream = 
ReaderSpliterator.lineStream(getReader(), 2)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(2); + forkJoinPool.submit(() -> { + Map threads = + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() <= 2); + } + ).get(); + } } @Test - public void testActuallyParallel_mediumBatch() throws ExecutionException, InterruptedException { + public void testActuallyParallel_mediumBatch() throws ExecutionException, InterruptedException, FileNotFoundException { //With 9 elements and a batch of 2, we should only ceil(9/2) = 5 batches, so at most 5 threads of the pool of 10 will be used - Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 2); - ForkJoinPool forkJoinPool = new ForkJoinPool(10); - forkJoinPool.submit(() -> { - Map threads= - stream.parallel().map(s -> Thread.currentThread().getName()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertTrue(threads.size() <= (int)Math.ceil(9.0/2) && threads.size() > 1); - } - ).get(); + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 2)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(10); + forkJoinPool.submit(() -> { + Map threads = + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() <= (int) Math.ceil(9.0 / 2) && threads.size() > 1); + } + ).get(); + } } @Test - public void testActuallyParallel_bigBatch() throws ExecutionException, InterruptedException { + public void testActuallyParallel_bigBatch() throws ExecutionException, InterruptedException, FileNotFoundException { //With 9 elements and a batch of 10, we should only have one batch, so only one thread will be used //despite the thread pool size of 2. 
- Stream stream = ReaderSpliterator.lineStream(new BufferedReader(new StringReader(data)), 10); - ForkJoinPool forkJoinPool = new ForkJoinPool(2); - forkJoinPool.submit(() -> { - Map threads= - stream.parallel().map(s -> Thread.currentThread().getName()) - .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); - Assert.assertEquals(1, threads.size()); - } - ).get(); + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 10)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(2); + forkJoinPool.submit(() -> { + Map threads = + stream.parallel().map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertEquals(1, threads.size()); + } + ).get(); + } } + } From 9b04f9723d442c8f4fb7a8bcaa1d733fc1305dc4 Mon Sep 17 00:00:00 2001 From: cstella Date: Sat, 28 Jan 2017 00:17:12 -0500 Subject: [PATCH 10/32] Updating docs and renaming a few things. --- .../common/utils/file/ReaderSpliterator.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java index d10146ba65..ebb4fad207 100644 --- a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java @@ -27,6 +27,16 @@ import static java.util.Spliterators.spliterator; +/** + * A Spliterator which works well on sequential streams by constructing a + * fixed batch size split rather than inheriting the spliterator from BufferedReader.lines() + * which gives up and reports no size and has no strategy for batching. This is a bug + * in Java 8 and will be fixed in Java 9. 
+ * + * The ideas have been informed by https://www.airpair.com/java/posts/parallel-processing-of-io-based-data-with-java-streams + * except more specific to strings and motivated by a JDK 8 bug as + * described at http://bytefish.de/blog/jdk8_files_lines_parallel_stream/ + */ public class ReaderSpliterator implements Spliterator { private static int characteristics = NONNULL | ORDERED | IMMUTABLE; private int batchSize ; @@ -127,7 +137,7 @@ public boolean tryAdvance(Consumer action) { */ @Override public Spliterator trySplit() { - final HoldingConsumer holder = new HoldingConsumer<>(); + final ConsumerWithLookback holder = new ConsumerWithLookback(); if (!tryAdvance(holder)) { return null; } @@ -191,7 +201,7 @@ public int characteristics() { return characteristics; } - static class HoldingConsumer implements Consumer { + static class ConsumerWithLookback implements Consumer { String value; /** * Performs this operation on the given argument. From eb5b82cc35bd767a169f548ea8144dd9ae165f84 Mon Sep 17 00:00:00 2001 From: cstella Date: Sat, 28 Jan 2017 00:23:25 -0500 Subject: [PATCH 11/32] Update one more test case. 
--- .../common/utils/file/ReaderSpliterator.java | 6 +++- .../utils/file/ReaderSpliteratorTest.java | 31 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java index ebb4fad207..20a40fa76f 100644 --- a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/file/ReaderSpliterator.java @@ -215,7 +215,11 @@ public void accept(String string) { } public static Stream lineStream(BufferedReader in, int batchSize) { - return StreamSupport.stream(new ReaderSpliterator(in, batchSize), false) + return lineStream(in, batchSize, false); + } + + public static Stream lineStream(BufferedReader in, int batchSize, boolean isParallel) { + return StreamSupport.stream(new ReaderSpliterator(in, batchSize), isParallel) .onClose(() -> { try { in.close(); diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java index cf259d8fb2..965840f360 100644 --- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/file/ReaderSpliteratorTest.java @@ -135,6 +135,37 @@ public void testActuallyParallel_mediumBatch() throws ExecutionException, Interr } } + @Test + public void testActuallyParallel_mediumBatchNotImplicitlyParallel() throws ExecutionException, InterruptedException, FileNotFoundException { + //Since this is not parallel and we're not making the stream itself parallel, we should only use one thread from the thread pool. 
+ try( Stream stream = ReaderSpliterator.lineStream(getReader(), 2, false)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(10); + forkJoinPool.submit(() -> { + Map threads = + stream.map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() == 1); + } + ).get(); + } + } + + @Test + public void testActuallyParallel_mediumBatchImplicitlyParallel() throws ExecutionException, InterruptedException, FileNotFoundException { + //With 9 elements and a batch of 2, we should only ceil(9/2) = 5 batches, so at most 5 threads of the pool of 10 will be used + //despite not calling .parallel() on the stream, we are constructing the stream to be implicitly parallel + try( Stream stream = ReaderSpliterator.lineStream(getReader(), 2, true)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(10); + forkJoinPool.submit(() -> { + Map threads = + stream.map(s -> Thread.currentThread().getName()) + .collect(Collectors.toMap(s -> s, s -> 1, Integer::sum)); + Assert.assertTrue(threads.size() <= (int) Math.ceil(9.0 / 2) && threads.size() > 1); + } + ).get(); + } + } + @Test public void testActuallyParallel_bigBatch() throws ExecutionException, InterruptedException, FileNotFoundException { //With 9 elements and a batch of 10, we should only have one batch, so only one thread will be used From 81c42afa2ff619ca23bfa5ec546c94ee8d6063e5 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Mon, 30 Jan 2017 09:09:52 -0700 Subject: [PATCH 12/32] partial commit - adding additional filter and transform for indicator --- .../TransformFilterExtractorDecorator.java | 34 +++++++++++++------ .../SimpleEnrichmentFlatFileLoaderTest.java | 8 +++-- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index 5b32c27ea2..fd1b4a3047 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -19,11 +19,15 @@ public class TransformFilterExtractorDecorator extends ExtractorDecorator { private static final Logger LOG = Logger.getLogger(TransformFilterExtractorDecorator.class); - private static final String TRANSFORMATIONS = "transformations"; - private static final String FILTER = "filter"; + private static final String VALUE_TRANSFORM = "value_transform"; + private static final String VALUE_FILTER = "value_filter"; + private static final String INDICATOR_TRANSFORM = "indicator_transform"; + private static final String INDICATOR_FILTER = "indicator_filter"; private static final String ZK_QUORUM = "zk_quorum"; - private Map transforms; - private String filterExpression; + private Map valueTransforms; + private Map indicatorTransforms; + private String valueFilter; + private String indicatorFilter; private Context stellarContext; private StellarProcessor transformProcessor; private StellarPredicateProcessor filterProcessor; @@ -36,13 +40,21 @@ public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { @Override public void initialize(Map config) { super.initialize(config); - if (config.containsKey(TRANSFORMATIONS)) { - this.transforms = getTransforms(config.get(TRANSFORMATIONS)); + if (config.containsKey(VALUE_TRANSFORM)) { + this.valueTransforms = getTransforms(config.get(VALUE_TRANSFORM)); } else { - this.transforms = new HashMap<>(); + this.valueTransforms = new HashMap<>(); } - if (config.containsKey(FILTER)) { - this.filterExpression = config.get(FILTER).toString(); + if 
(config.containsKey(INDICATOR_TRANSFORM)) { + this.indicatorTransforms = getTransforms(config.get(INDICATOR_TRANSFORM)); + } else { + this.indicatorTransforms = new HashMap<>(); + } + if (config.containsKey(VALUE_FILTER)) { + this.valueFilter = config.get(VALUE_FILTER).toString(); + } + if (config.containsKey(INDICATOR_FILTER)) { + this.indicatorFilter = config.get(INDICATOR_FILTER).toString(); } String zkClientUrl = ""; if (config.containsKey(ZK_QUORUM)) { @@ -120,7 +132,7 @@ public Iterable extract(String line) throws IOException { private boolean updateLookupKV(LookupKV lkv) { Map ret = lkv.getValue().getMetadata(); MapVariableResolver resolver = new MapVariableResolver(ret, globalConfig); - for (Map.Entry entry : transforms.entrySet()) { + for (Map.Entry entry : valueTransforms.entrySet()) { Object o = transformProcessor.parse(entry.getValue(), resolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); if (o == null) { ret.remove(entry.getKey()); @@ -128,7 +140,7 @@ private boolean updateLookupKV(LookupKV lkv) { ret.put(entry.getKey(), o); } } - return filterProcessor.parse(filterExpression, resolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); + return filterProcessor.parse(valueFilter, resolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); } } diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java index e7c908a2f8..bae6c83b6a 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java @@ -135,10 +135,14 @@ public void basicTest() throws Exception { "columns" : { "domain" 
: 1 }, - "transformations" : { + "value_transform" : { "domain" : "TO_UPPER(domain)" }, - "filter" : "LENGTH(domain) > 0", + "indicator_transform" : { + "domain" : "TO_UPPER(domain)" + }, + "value_filter" : "LENGTH(domain) > 0", + "indicator_filter" : "LENGTH(domain) > 0", "indicator_column" : "domain", "type" : "topdomain", "separator" : "," From 3f6e3ba4f30e41c94ff25027f1fd7c839ea6c9bf Mon Sep 17 00:00:00 2001 From: cstella Date: Tue, 31 Jan 2017 10:39:03 -0500 Subject: [PATCH 13/32] Updating simple enrichment flat file loader to be complete. --- .../extractor/inputformat/Formats.java | 50 ++-- .../inputformat/InputFormatHandler.java | 7 +- .../inputformat/WholeFileFormat.java | 123 ++++---- .../nonbulk/flatfile/ImportStrategy.java | 33 +++ .../dataloads/nonbulk/flatfile/Importer.java | 15 + .../nonbulk/flatfile/LoadOptions.java | 229 +++++++++++++++ .../nonbulk/flatfile/LocalImporter.java | 247 ++++++++++++++++ .../nonbulk/flatfile/MapReduceImporter.java | 54 ++++ .../nonbulk/flatfile/OptionHandler.java | 14 + .../SimpleEnrichmentFlatFileLoader.java | 270 +----------------- .../SimpleEnrichmentFlatFileLoaderTest.java | 31 +- 11 files changed, 703 insertions(+), 370 deletions(-) create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java create mode 100644 
metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/Formats.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/Formats.java index b8be2338a6..961e7d3785 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/Formats.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/Formats.java @@ -23,34 +23,34 @@ import java.io.IOException; import java.lang.reflect.InvocationTargetException; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; -public enum Formats implements InputFormatHandler{ - BY_LINE(new InputFormatHandler() { - @Override - public void set(Job job, Path input, Map config) throws IOException { +public enum Formats implements InputFormatHandler { + BY_LINE( (job, inputs, config) -> { + for(Path input : inputs) { + FileInputFormat.addInputPath(job, input); + } + }), + WHOLE_FILE( new WholeFileFormat()); + InputFormatHandler _handler = null; + Formats(InputFormatHandler handler) { + this._handler = handler; + } + @Override + public void set(Job job, List path, Map config) throws IOException { + _handler.set(job, path, config); + } - FileInputFormat.addInputPath(job, input); - } - }) - ; - InputFormatHandler _handler = null; - Formats(InputFormatHandler handler) { - this._handler = handler; + public static InputFormatHandler create(String handlerName) throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { + try { + InputFormatHandler ec = Formats.valueOf(handlerName)._handler; + return ec; } - @Override - public void set(Job job, Path path, Map config) throws IOException { - _handler.set(job, path, 
config); - } - - public static InputFormatHandler create(String handlerName) throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { - try { - InputFormatHandler ec = Formats.valueOf(handlerName); - return ec; - } - catch(IllegalArgumentException iae) { - InputFormatHandler ex = (InputFormatHandler) Class.forName(handlerName).getConstructor().newInstance(); - return ex; - } + catch(IllegalArgumentException iae) { + InputFormatHandler ex = (InputFormatHandler) Class.forName(handlerName).getConstructor().newInstance(); + return ex; } + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/InputFormatHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/InputFormatHandler.java index 2287969df7..00e89c0b41 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/InputFormatHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/InputFormatHandler.java @@ -17,12 +17,17 @@ */ package org.apache.metron.dataloads.extractor.inputformat; +import com.google.common.collect.ImmutableList; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import java.io.IOException; +import java.util.List; import java.util.Map; public interface InputFormatHandler { - void set(Job job, Path input, Map config) throws IOException; + void set(Job job, List input, Map config) throws IOException; + default void set(Job job, Path input, Map config) throws IOException { + set(job, ImmutableList.of(input), config); + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/WholeFileFormat.java 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/WholeFileFormat.java index e0a58efbec..5dc8b53989 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/WholeFileFormat.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/inputformat/WholeFileFormat.java @@ -30,80 +30,83 @@ import org.apache.hadoop.mapreduce.lib.input.FileSplit; import java.io.IOException; +import java.util.List; import java.util.Map; public class WholeFileFormat implements InputFormatHandler { - public static class WholeFileRecordReader extends RecordReader { - private FileSplit fileSplit; - private Configuration conf; - private Text value = new Text(); - private boolean processed = false; + public static class WholeFileRecordReader extends RecordReader { + private FileSplit fileSplit; + private Configuration conf; + private Text value = new Text(); + private boolean processed = false; - @Override - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { - this.fileSplit = (FileSplit) split; - this.conf = context.getConfiguration(); - } - - @Override - public boolean nextKeyValue() throws IOException, InterruptedException { - if (!processed) { - byte[] contents = new byte[(int) fileSplit.getLength()]; - Path file = fileSplit.getPath(); - FileSystem fs = file.getFileSystem(conf); - FSDataInputStream in = null; - try { - in = fs.open(file); - IOUtils.readFully(in, contents, 0, contents.length); - value.set(contents, 0, contents.length); - } finally { - IOUtils.closeStream(in); - } - processed = true; - return true; - } - return false; - } + @Override + public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + this.fileSplit = (FileSplit) split; + this.conf = context.getConfiguration(); + } - @Override - public NullWritable 
getCurrentKey() throws IOException, InterruptedException { - return NullWritable.get(); - } - @Override - public Text getCurrentValue() throws IOException, InterruptedException{ - return value; + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + if (!processed) { + byte[] contents = new byte[(int) fileSplit.getLength()]; + Path file = fileSplit.getPath(); + FileSystem fs = file.getFileSystem(conf); + FSDataInputStream in = null; + try { + in = fs.open(file); + IOUtils.readFully(in, contents, 0, contents.length); + value.set(contents, 0, contents.length); + } finally { + IOUtils.closeStream(in); } + processed = true; + return true; + } + return false; + } - @Override - public float getProgress() throws IOException { - return processed ? 1.0f : 0.0f; - } + @Override + public NullWritable getCurrentKey() throws IOException, InterruptedException { + return NullWritable.get(); + } + @Override + public Text getCurrentValue() throws IOException, InterruptedException{ + return value; + } - @Override - public void close() throws IOException{ - //do nothing :) - } + @Override + public float getProgress() throws IOException { + return processed ? 
1.0f : 0.0f; } - public static class WholeFileInputFormat extends FileInputFormat { + @Override + public void close() throws IOException{ + //do nothing :) + } + } - @Override - protected boolean isSplitable(JobContext context, Path file) { - return false; - } + public static class WholeFileInputFormat extends FileInputFormat { - @Override - public RecordReader createRecordReader( - InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { - WholeFileRecordReader reader = new WholeFileRecordReader(); - reader.initialize(split, context); - return reader; - } + @Override + protected boolean isSplitable(JobContext context, Path file) { + return false; } + @Override - public void set(Job job, Path input, Map config) throws IOException { - WholeFileInputFormat.setInputPaths(job, input); - job.setInputFormatClass(WholeFileInputFormat.class); + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + WholeFileRecordReader reader = new WholeFileRecordReader(); + reader.initialize(split, context); + return reader; + } + } + @Override + public void set(Job job, List inputs, Map config) throws IOException { + for(Path input : inputs) { + WholeFileInputFormat.addInputPath(job, input); } + job.setInputFormatClass(WholeFileInputFormat.class); + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java new file mode 100644 index 0000000000..148cfe8c55 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java @@ -0,0 +1,33 @@ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import com.google.common.base.Joiner; + +import java.util.List; +import java.util.Optional; + +public enum ImportStrategy { 
+ LOCAL(LocalImporter.INSTANCE), + MR(MapReduceImporter.INSTANCE) + ; + private Importer importer; + + ImportStrategy(Importer importer) { + this.importer = importer; + } + + public Importer getImporter() { + return importer; + } + + public static Optional getStrategy(String strategyName) { + if(strategyName == null) { + return Optional.empty(); + } + for(ImportStrategy strategy : values()) { + if(strategy.name().equalsIgnoreCase(strategyName)) { + return Optional.of(strategy); + } + } + return Optional.empty(); + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java new file mode 100644 index 0000000000..05810f44f3 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java @@ -0,0 +1,15 @@ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.metron.dataloads.extractor.ExtractorHandler; +import org.apache.metron.enrichment.converter.EnrichmentConverter; + +import java.io.IOException; +import java.util.EnumMap; +import java.util.List; +import java.util.Optional; + +public interface Importer { + void importData(EnumMap> config, ExtractorHandler handler , final Configuration hadoopConfig) throws IOException; +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java new file mode 100644 index 0000000000..c7cf71c78a --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java @@ -0,0 +1,229 @@ +package 
org.apache.metron.dataloads.nonbulk.flatfile; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import org.apache.commons.cli.*; +import org.apache.commons.io.FileUtils; +import org.apache.metron.common.utils.ConversionUtils; + +import javax.annotation.Nullable; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.Optional; + +public enum LoadOptions { + HELP("h", new OptionHandler() { + + @Nullable + @Override + public Option apply(@Nullable String s) { + return new Option(s, "help", false, "Generate Help screen"); + } + }) + , IMPORT_MODE("m", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "import_mode", true + , "The Import mode to use: " + Joiner.on(",").join(ImportStrategy.values()) + + ". Default: " + ImportStrategy.LOCAL + ); + o.setArgName("MODE"); + o.setRequired(false); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + String mode = option.get(cli); + return Optional.of(ImportStrategy.getStrategy(mode).orElse(ImportStrategy.LOCAL)); + } + }) + ,HBASE_TABLE("t", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "hbase_table", true, "HBase table to ingest the data into."); + o.setArgName("TABLE"); + o.setRequired(true); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + return Optional.ofNullable(option.get(cli)); + } + }) + ,HBASE_CF("c", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "hbase_cf", true, "HBase column family to ingest the data into."); + o.setArgName("CF"); + o.setRequired(true); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + return 
Optional.ofNullable(option.get(cli)); + } + }) + ,EXTRACTOR_CONFIG("e", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "extractor_config", true, "JSON Document describing the extractor for this input data source"); + o.setArgName("JSON_FILE"); + o.setRequired(true); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + try { + return Optional.ofNullable(FileUtils.readFileToString(new File(option.get(cli)))); + } catch (IOException e) { + throw new IllegalStateException("Unable to retrieve extractor config from " + option.get(cli) + ": " + e.getMessage(), e); + } + } + }) + ,ENRICHMENT_CONFIG("n", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "enrichment_config", true + , "JSON Document describing the enrichment configuration details." + + " This is used to associate an enrichment type with a field type in zookeeper." + ); + o.setArgName("JSON_FILE"); + o.setRequired(false); + return o; + } + }) + ,LOG4J_PROPERTIES("l", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "log4j", true, "The log4j properties file to load"); + o.setArgName("FILE"); + o.setRequired(false); + return o; + } + }) + ,NUM_THREADS("p", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "threads", true, "The number of threads to use when extracting data. 
The default is the number of cores of your machine."); + o.setArgName("NUM_THREADS"); + o.setRequired(false); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + int numThreads = Runtime.getRuntime().availableProcessors(); + if(option.has(cli)) { + numThreads = ConversionUtils.convert(option.get(cli), Integer.class); + } + return Optional.of(numThreads); + } + }) + ,BATCH_SIZE("b", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "batchSize", true, "The batch size to use for HBase puts"); + o.setArgName("SIZE"); + o.setRequired(false); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + int batchSize = 128; + if(option.has(cli)) { + batchSize = ConversionUtils.convert(option.get(cli), Integer.class); + } + return Optional.of(batchSize); + } + }) + ,INPUT("i", new OptionHandler() { + @Nullable + @Override + public Option apply(@Nullable String s) { + Option o = new Option(s, "input", true, "The CSV File to load"); + o.setArgName("FILE"); + o.setRequired(true); + return o; + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + List inputs = new ArrayList<>(); + for(String input : Splitter.on(",").split(Optional.ofNullable(option.get(cli)).orElse(""))) { + inputs.add(input.trim()); + } + return Optional.of(inputs); + } + }) + ; + Option option; + String shortCode; + OptionHandler handler; + LoadOptions(String shortCode, OptionHandler optionHandler) { + this.shortCode = shortCode; + this.handler = optionHandler; + this.option = optionHandler.apply(shortCode); + } + + public boolean has(CommandLine cli) { + return cli.hasOption(shortCode); + } + + public String get(CommandLine cli) { + return cli.getOptionValue(shortCode); + } + + public static CommandLine parse(CommandLineParser parser, String[] args) { + try { + CommandLine cli = parser.parse(getOptions(), args); + 
if(HELP.has(cli)) { + printHelp(); + System.exit(0); + } + return cli; + } catch (ParseException e) { + System.err.println("Unable to parse args: " + Joiner.on(' ').join(args)); + e.printStackTrace(System.err); + printHelp(); + System.exit(-1); + return null; + } + } + + public static EnumMap > createConfig(CommandLine cli) { + EnumMap > ret = new EnumMap<>(LoadOptions.class); + for(LoadOptions option : values()) { + ret.put(option, option.handler.getValue(option, cli)); + } + return ret; + } + + public static void printHelp() { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp( "SimpleEnrichmentFlatFileLoader", getOptions()); + } + + public static Options getOptions() { + Options ret = new Options(); + for(LoadOptions o : LoadOptions.values()) { + ret.addOption(o.option); + } + return ret; + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java new file mode 100644 index 0000000000..e87d63bc93 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java @@ -0,0 +1,247 @@ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import com.google.common.collect.ImmutableList; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hbase.client.Put; +import org.apache.metron.common.utils.file.ReaderSpliterator; +import org.apache.metron.dataloads.extractor.Extractor; +import org.apache.metron.dataloads.extractor.ExtractorHandler; +import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; +import 
org.apache.metron.enrichment.converter.EnrichmentConverter; +import org.apache.metron.enrichment.converter.HbaseConverter; +import org.apache.metron.enrichment.lookup.LookupKV; +import org.apache.metron.hbase.HTableProvider; + +import java.io.*; +import java.nio.file.Files; +import java.util.*; +import java.util.concurrent.ForkJoinPool; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.sun.tools.javac.jvm.ByteCodes.ret; + +public enum LocalImporter implements Importer { + INSTANCE; + private static ThreadLocal fs = new ThreadLocal(){ + + @Override + protected FileSystem initialValue() { + try { + return FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new IllegalStateException("Unable to retrieve the filesystem: " + e.getMessage(), e); + } + } + }; + + /** + * Location can be either a local file or a file on HDFS. + */ + private static final class Location { + private String loc; + private boolean isLocal; + + public Location(String loc) { + this(loc, !loc.startsWith("hdfs://")); + } + public Location(String loc, boolean isLocal) { + this.loc = loc; + this.isLocal = isLocal; + } + + public Optional> getChildren() throws IOException { + if(exists() && isDirectory()) { + List children = new ArrayList<>(); + for(String child : list().orElse(new ArrayList<>())) { + children.add(new Location(child, isLocal)); + } + return Optional.of(children); + } + else { + return Optional.empty(); + } + } + + private Optional> list() throws IOException { + List children = new ArrayList<>(); + if(isLocal) { + for(File f : new File(loc).listFiles()) { + children.add(f.getPath()); + } + } + else { + for(FileStatus f : fs.get().listStatus(new Path(loc)) ) { + children.add(f.getPath().toString()); + } + } + return Optional.of(children); + } + + public boolean exists() throws IOException { + if(isLocal) { + return new File(loc).exists(); + } + else { + 
return fs.get().exists(new Path(loc)); + } + } + + public boolean isDirectory() throws IOException { + if(isLocal) { + return new File(loc).isDirectory(); + } + else { + return fs.get().isDirectory(new Path(loc)); + } + } + + public BufferedReader openReader() throws IOException { + if(isLocal) { + return new BufferedReader(new FileReader(new File(loc))); + } + else { + return new BufferedReader(new InputStreamReader(fs.get().open(new Path(loc)))); + } + } + + @Override + public String toString() { + return loc; + } + } + + public interface HTableProviderRetriever { + HTableProvider retrieve(); + } + + + @Override + public void importData( final EnumMap> config + , final ExtractorHandler handler + , final Configuration hadoopConfig + ) throws IOException { + importData(config, handler, hadoopConfig, () -> new HTableProvider()); + + } + public void importData( final EnumMap> config + , final ExtractorHandler handler + , final Configuration hadoopConfig + , final HTableProviderRetriever provider + ) throws IOException { + ThreadLocal state = new ThreadLocal() { + @Override + protected ExtractorState initialValue() { + try { + HTableInterface table = provider.retrieve().getTable(hadoopConfig, (String) config.get(LoadOptions.HBASE_TABLE).get()); + return new ExtractorState(table, handler.getExtractor(), new EnrichmentConverter()); + } catch (IOException e1) { + throw new IllegalStateException("Unable to get table: " + e1); + } + } + }; + + boolean lineByLine = !handler.getInputFormatHandler().getClass().equals(WholeFileFormat.class); + List inputs = (List) config.get(LoadOptions.INPUT).get(); + String cf = (String) config.get(LoadOptions.HBASE_CF).get(); + if(!lineByLine) { + extractWholeFiles(inputs, state, cf); + } + else { + int batchSize = (int) config.get(LoadOptions.BATCH_SIZE).get(); + int numThreads = (int) config.get(LoadOptions.NUM_THREADS).get(); + extractLineByLine(inputs, state, cf, batchSize, numThreads); + } + + } + + public void extractLineByLine( List 
inputs + , ThreadLocal state + , String cf + , int batchSize + , int numThreads + ) throws IOException { + inputs.stream().map(input -> new Location(input)) + .forEach( loc -> { + try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { + + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + forkJoinPool.submit(() -> + stream.parallel().forEach(input -> { + ExtractorState es = state.get(); + try { + es.getTable().put(extract(input, es.getExtractor(), cf, es.getConverter())); + } catch (IOException e) { + throw new IllegalStateException("Unable to continue: " + e.getMessage(), e); + } + } + ) + ).get(); + } catch (Exception e) { + throw new IllegalStateException(e.getMessage(), e); + } + } + ); + } + + public void extractWholeFiles( List inputs, ThreadLocal state, String cf) throws IOException { + final List locations = new ArrayList<>(); + fileVisitor(inputs, loc -> locations.add(loc)); + locations.parallelStream().forEach(loc -> { + try(BufferedReader br = loc.openReader()) { + String s = br.lines().collect(Collectors.joining()); + state.get().getTable().put(extract(s, state.get().getExtractor(), cf, state.get().getConverter())); + } catch (IOException e) { + throw new IllegalStateException("Unable to read " + loc + ": " + e.getMessage(), e); + } + }); + } + + + public List extract(String line + , Extractor extractor + , String cf + , HbaseConverter converter + ) throws IOException + { + List ret = new ArrayList<>(); + Iterable kvs = extractor.extract(line); + for(LookupKV kv : kvs) { + Put put = converter.toPut(cf, kv.getKey(), kv.getValue()); + ret.add(put); + } + return ret; + } + + + public void fileVisitor(List inputs + , final Consumer importConsumer + ) throws IOException { + Stack stack = new Stack<>(); + for(String input : inputs) { + Location loc = new Location(input); + if(loc.exists()) { + stack.add(loc); + } + } + while(!stack.empty()) { + Location loc = stack.pop(); + if(loc.isDirectory()) { + for(Location child : 
loc.getChildren().orElse(Collections.emptyList())) { + stack.push(child); + } + } + else { + importConsumer.accept(loc); + } + } + } + +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java new file mode 100644 index 0000000000..b9bcc49a3c --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java @@ -0,0 +1,54 @@ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; +import org.apache.hadoop.mapreduce.Job; +import org.apache.metron.dataloads.extractor.ExtractorHandler; +import org.apache.metron.dataloads.hbase.mr.BulkLoadMapper; +import org.apache.metron.enrichment.converter.EnrichmentConverter; + +import java.io.IOException; +import java.util.EnumMap; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + + +public enum MapReduceImporter implements Importer{ + INSTANCE + ; + + @Override + public void importData(EnumMap> config + , ExtractorHandler handler + , Configuration hadoopConfig + ) throws IOException { + String table = (String) config.get(LoadOptions.HBASE_TABLE).get(); + String cf = (String) config.get(LoadOptions.HBASE_CF).get(); + String extractorConfigContents = (String) config.get(LoadOptions.EXTRACTOR_CONFIG).get(); + Job job = new Job(hadoopConfig); + List inputs = (List) config.get(LoadOptions.INPUT).get(); + job.setJobName("MapReduceImporter: " + inputs.stream().collect(Collectors.joining(",")) + " => " + table + ":" + cf); + System.out.println("Configuring " + job.getJobName()); + 
job.setJarByClass(MapReduceImporter.class); + job.setMapperClass(org.apache.metron.dataloads.hbase.mr.BulkLoadMapper.class); + job.setOutputFormatClass(TableOutputFormat.class); + job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table); + job.getConfiguration().set(BulkLoadMapper.COLUMN_FAMILY_KEY, cf); + job.getConfiguration().set(BulkLoadMapper.CONFIG_KEY, extractorConfigContents); + job.getConfiguration().set(BulkLoadMapper.CONVERTER_KEY, EnrichmentConverter.class.getName()); + job.setOutputKeyClass(ImmutableBytesWritable.class); + job.setOutputValueClass(Put.class); + job.setNumReduceTasks(0); + List paths = inputs.stream().map(p -> new Path(p)).collect(Collectors.toList()); + handler.getInputFormatHandler().set(job, paths, handler.getConfig()); + try { + job.waitForCompletion(true); + } catch (Exception e) { + throw new IllegalStateException("Unable to complete job: " + e.getMessage(), e); + } + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java new file mode 100644 index 0000000000..9b6ef6d201 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java @@ -0,0 +1,14 @@ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import com.google.common.base.Function; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; + +import java.util.Optional; + +public abstract class OptionHandler implements Function +{ + public Optional getValue(LoadOptions option, CommandLine cli) { + return Optional.empty(); + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index 9992422db3..f74d4108d9 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -17,10 +17,7 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile; -import com.google.common.base.Function; -import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; @@ -41,288 +38,37 @@ import org.apache.metron.enrichment.lookup.LookupKV; import org.apache.metron.common.utils.JSONUtils; -import javax.annotation.Nullable; import java.io.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Stack; +import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.stream.Stream; public class SimpleEnrichmentFlatFileLoader { - private static abstract class OptionHandler implements Function {} - public static enum LoadOptions { - HELP("h", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - return new Option(s, "help", false, "Generate Help screen"); - } - }) - ,HBASE_TABLE("t", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "hbase_table", true, "HBase table to ingest the data into."); - o.setArgName("TABLE"); - o.setRequired(true); - return o; - } - }) - ,HBASE_CF("c", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "hbase_cf", true, "HBase column family to ingest the data 
into."); - o.setArgName("CF"); - o.setRequired(true); - return o; - } - }) - ,EXTRACTOR_CONFIG("e", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "extractor_config", true, "JSON Document describing the extractor for this input data source"); - o.setArgName("JSON_FILE"); - o.setRequired(true); - return o; - } - }) - ,ENRICHMENT_CONFIG("n", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "enrichment_config", true - , "JSON Document describing the enrichment configuration details." + - " This is used to associate an enrichment type with a field type in zookeeper." - ); - o.setArgName("JSON_FILE"); - o.setRequired(false); - return o; - } - }) - ,LOG4J_PROPERTIES("l", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "log4j", true, "The log4j properties file to load"); - o.setArgName("FILE"); - o.setRequired(false); - return o; - } - }) - ,NUM_THREADS("p", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "threads", true, "The number of threads to use when extracting data. 
The default is the number of cores of your machine."); - o.setArgName("NUM_THREADS"); - o.setRequired(false); - return o; - } - }) - ,BATCH_SIZE("b", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "batchSize", true, "The batch size to use for HBase puts"); - o.setArgName("SIZE"); - o.setRequired(false); - return o; - } - }) - ,INPUT("i", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "input", true, "The CSV File to load"); - o.setArgName("FILE"); - o.setRequired(true); - return o; - } - }) - ; - Option option; - String shortCode; - LoadOptions(String shortCode, OptionHandler optionHandler) { - this.shortCode = shortCode; - this.option = optionHandler.apply(shortCode); - } - - public boolean has(CommandLine cli) { - return cli.hasOption(shortCode); - } - - public String get(CommandLine cli) { - return cli.getOptionValue(shortCode); - } - - public static CommandLine parse(CommandLineParser parser, String[] args) { - try { - CommandLine cli = parser.parse(getOptions(), args); - if(HELP.has(cli)) { - printHelp(); - System.exit(0); - } - return cli; - } catch (ParseException e) { - System.err.println("Unable to parse args: " + Joiner.on(' ').join(args)); - e.printStackTrace(System.err); - printHelp(); - System.exit(-1); - return null; - } - } - - public static void printHelp() { - HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp( "SimpleEnrichmentFlatFileLoader", getOptions()); - } - - public static Options getOptions() { - Options ret = new Options(); - for(LoadOptions o : LoadOptions.values()) { - ret.addOption(o.option); - } - return ret; - } - } - public static List getFiles(File root) { - if(!root.isDirectory()) { - return ImmutableList.of(root); - } - List ret = new ArrayList<>(); - Stack stack = new Stack(); - stack.push(root); - while(!stack.isEmpty()) { - File f = stack.pop(); - if(f.isDirectory()) { - 
for(File child : f.listFiles()) { - stack.push(child); - } - } - else { - ret.add(f); - } - } - return ret; - } - - public HTableProvider getProvider() { - return new HTableProvider(); - } - - public List extract( String line - , Extractor extractor - , String cf - , HbaseConverter converter - ) throws IOException - { - List ret = new ArrayList<>(); - Iterable kvs = extractor.extract(line); - for(LookupKV kv : kvs) { - Put put = converter.toPut(cf, kv.getKey(), kv.getValue()); - ret.add(put); - } - return ret; - } - - public void load( final Iterable> streams - , final ThreadLocal state - , final String cf - , int numThreads - ) - { - for(Stream stream : streams) { - try { - ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); - forkJoinPool.submit(() -> - stream.parallel().forEach(input -> { - ExtractorState es = state.get(); - try { - es.getTable().put(extract(input, es.getExtractor(), cf, es.getConverter())); - } catch (IOException e) { - throw new IllegalStateException("Unable to continue: " + e.getMessage(), e); - } - } - ) - ).get(); - } catch (InterruptedException e) { - throw new IllegalStateException(e.getMessage(), e); - } catch (ExecutionException e) { - throw new IllegalStateException(e.getMessage(), e); - } finally { - stream.close(); - } - } - } - - private static Iterable> streamify(List files, int batchSize, boolean lineByLine) throws FileNotFoundException { - List> ret = new ArrayList<>(); - if(!lineByLine) { - ret.add(files.stream().map(f -> { - try { - return FileUtils.readFileToString(f); - } catch (IOException e) { - throw new IllegalStateException("File " + f.getName() + " not found."); - } - })); - } - else { - for(File f : files) { - ret.add(ReaderSpliterator.lineStream(new BufferedReader(new FileReader(f)), batchSize)); - } - } - return ret; - } public static void main(String... 
argv) throws Exception { - Configuration conf = HBaseConfiguration.create(); - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); + Configuration hadoopConfig = HBaseConfiguration.create(); + String[] otherArgs = new GenericOptionsParser(hadoopConfig, argv).getRemainingArgs(); CommandLine cli = LoadOptions.parse(new PosixParser(), otherArgs); + EnumMap> config = LoadOptions.createConfig(cli); if(LoadOptions.LOG4J_PROPERTIES.has(cli)) { PropertyConfigurator.configure(LoadOptions.LOG4J_PROPERTIES.get(cli)); } ExtractorHandler handler = ExtractorHandler.load( FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli))) ); - int batchSize = 128; - if(LoadOptions.BATCH_SIZE.has(cli)) { - batchSize = ConversionUtils.convert(LoadOptions.BATCH_SIZE.get(cli), Integer.class); - } - int numThreads = Runtime.getRuntime().availableProcessors(); - if(LoadOptions.NUM_THREADS.has(cli)) { - numThreads = ConversionUtils.convert(LoadOptions.NUM_THREADS.get(cli), Integer.class); - } - boolean lineByLine = !handler.getInputFormatHandler().getClass().equals(WholeFileFormat.class); + ImportStrategy strategy = (ImportStrategy) config.get(LoadOptions.IMPORT_MODE).get(); + strategy.getImporter().importData(config, handler, hadoopConfig); + + SensorEnrichmentUpdateConfig sensorEnrichmentUpdateConfig = null; if(LoadOptions.ENRICHMENT_CONFIG.has(cli)) { sensorEnrichmentUpdateConfig = JSONUtils.INSTANCE.load( new File(LoadOptions.ENRICHMENT_CONFIG.get(cli)) , SensorEnrichmentUpdateConfig.class ); } - List inputFiles = getFiles(new File(LoadOptions.INPUT.get(cli))); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - ThreadLocal state = new ThreadLocal() { - @Override - protected ExtractorState initialValue() { - try { - ExtractorHandler handler = ExtractorHandler.load( - FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli))) - ); - HTableInterface table = loader.getProvider().getTable(conf, 
LoadOptions.HBASE_TABLE.get(cli)); - return new ExtractorState(table, handler.getExtractor(), new EnrichmentConverter()); - } catch (IOException e1) { - throw new IllegalStateException("Unable to get table: " + e1); - } - } - }; - - loader.load(streamify(inputFiles, batchSize, lineByLine), state, LoadOptions.HBASE_CF.get(cli), numThreads); if(sensorEnrichmentUpdateConfig != null) { sensorEnrichmentUpdateConfig.updateSensorConfigs(); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java index 4ffb91a537..e1769a9ac4 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java @@ -21,39 +21,26 @@ import org.adrianwalker.multilinestring.Multiline; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.PosixParser; -import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.log4j.PropertyConfigurator; -import org.apache.metron.dataloads.bulk.ThreatIntelBulkLoader; import org.apache.metron.dataloads.extractor.Extractor; import 
org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; -import org.apache.metron.dataloads.nonbulk.flatfile.SimpleEnrichmentFlatFileLoader; import org.apache.metron.dataloads.hbase.mr.HBaseUtil; -import org.apache.metron.enrichment.converter.HbaseConverter; import org.apache.metron.enrichment.converter.EnrichmentConverter; import org.apache.metron.enrichment.converter.EnrichmentKey; import org.apache.metron.enrichment.converter.EnrichmentValue; import org.apache.metron.enrichment.lookup.LookupKV; -import org.apache.metron.common.utils.JSONUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -114,15 +101,15 @@ public void testCommandLine() throws Exception { }; String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - CommandLine cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson,SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); - Assert.assertEquals(tableName,SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); - Assert.assertEquals(enrichmentJson,SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - Assert.assertEquals(csvFile,SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); - Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); - Assert.assertEquals("2", SimpleEnrichmentFlatFileLoader.LoadOptions.NUM_THREADS.get(cli).trim()); - Assert.assertEquals("128", SimpleEnrichmentFlatFileLoader.LoadOptions.BATCH_SIZE.get(cli).trim()); + CommandLine cli = LoadOptions.parse(new 
PosixParser(), otherArgs); + Assert.assertEquals(extractorJson, LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); + Assert.assertEquals(cf, LoadOptions.HBASE_CF.get(cli).trim()); + Assert.assertEquals(tableName, LoadOptions.HBASE_TABLE.get(cli).trim()); + Assert.assertEquals(enrichmentJson, LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); + Assert.assertEquals(csvFile, LoadOptions.INPUT.get(cli).trim()); + Assert.assertEquals(log4jProperty, LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); + Assert.assertEquals("2", LoadOptions.NUM_THREADS.get(cli).trim()); + Assert.assertEquals("128", LoadOptions.BATCH_SIZE.get(cli).trim()); } @Test From 79cfdb4fba5e82e9e170bfc77c7133e6646f9787 Mon Sep 17 00:00:00 2001 From: cstella Date: Tue, 31 Jan 2017 17:12:05 -0500 Subject: [PATCH 14/32] Removing old threatintel_bulk_load.sh script and integrating into the flatfile load script --- .../docker/rpm-docker/SPECS/metron.spec | 1 - .../dataloads/bulk/ThreatIntelBulkLoader.java | 260 ----------------- .../dataloads/extractor/ExtractorHandler.java | 10 +- .../nonbulk/flatfile/ExtractorState.java | 16 +- .../nonbulk/flatfile/LoadOptions.java | 7 +- .../SimpleEnrichmentFlatFileLoader.java | 22 +- .../{ => importer}/ImportStrategy.java | 7 +- .../flatfile/{ => importer}/Importer.java | 3 +- .../{ => importer}/LocalImporter.java | 132 +-------- .../{ => importer}/MapReduceImporter.java | 5 +- .../flatfile/location/FileLocation.java | 40 +++ .../flatfile/location/HDFSLocation.java | 57 ++++ .../nonbulk/flatfile/location/Location.java | 78 +++++ .../flatfile/location/LocationStrategy.java | 50 ++++ .../flatfile/location/RawLocation.java | 17 ++ .../flatfile/location/URLLocation.java | 53 ++++ .../src/main/scripts/threatintel_bulk_load.sh | 41 --- .../mr/BulkLoadMapperIntegrationTest.java | 140 --------- ...richmentFlatFileLoaderIntegrationTest.java | 270 ++++++++++++++++++ .../SimpleEnrichmentFlatFileLoaderTest.java | 151 ---------- 20 files changed, 612 insertions(+), 748 deletions(-) delete 
mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/bulk/ThreatIntelBulkLoader.java rename metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/{ => importer}/ImportStrategy.java (76%) rename metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/{ => importer}/Importer.java (79%) rename metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/{ => importer}/LocalImporter.java (59%) rename metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/{ => importer}/MapReduceImporter.java (92%) create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java create mode 100644 metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java delete mode 100755 metron-platform/metron-data-management/src/main/scripts/threatintel_bulk_load.sh delete mode 100644 metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/BulkLoadMapperIntegrationTest.java create mode 100644 metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java delete mode 100644 
metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java diff --git a/metron-deployment/packaging/docker/rpm-docker/SPECS/metron.spec b/metron-deployment/packaging/docker/rpm-docker/SPECS/metron.spec index 8a6607b069..20f148ff4e 100644 --- a/metron-deployment/packaging/docker/rpm-docker/SPECS/metron.spec +++ b/metron-deployment/packaging/docker/rpm-docker/SPECS/metron.spec @@ -181,7 +181,6 @@ This package installs the Metron Parser files %{metron_home}/bin/flatfile_loader.sh %{metron_home}/bin/prune_elasticsearch_indices.sh %{metron_home}/bin/prune_hdfs_files.sh -%{metron_home}/bin/threatintel_bulk_load.sh %{metron_home}/bin/threatintel_bulk_prune.sh %{metron_home}/bin/threatintel_taxii_load.sh %attr(0644,root,root) %{metron_home}/lib/metron-data-management-%{full_version}.jar diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/bulk/ThreatIntelBulkLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/bulk/ThreatIntelBulkLoader.java deleted file mode 100644 index 5ba0a91630..0000000000 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/bulk/ThreatIntelBulkLoader.java +++ /dev/null @@ -1,260 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.metron.dataloads.bulk; - -import com.google.common.base.Function; -import com.google.common.base.Joiner; -import com.google.common.io.Files; -import org.apache.commons.cli.*; -import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.hbase.mr.BulkLoadMapper; -import org.apache.metron.common.configuration.enrichment.SensorEnrichmentUpdateConfig; -import org.apache.metron.enrichment.converter.HbaseConverter; -import org.apache.metron.enrichment.converter.EnrichmentConverter; -import org.apache.metron.common.utils.JSONUtils; - -import javax.annotation.Nullable; -import java.io.File; -import java.io.IOException; -import java.nio.charset.Charset; -import java.text.*; -import java.util.Date; - -public class ThreatIntelBulkLoader { - private static abstract class OptionHandler implements Function {} - public enum BulkLoadOptions { - HELP("h", new OptionHandler() { - - @Nullable - @Override - public Option apply(@Nullable String s) { - return new Option(s, "help", false, "Generate Help screen"); - } - }) - ,TABLE("t", new OptionHandler() { - @Nullable - @Override - 
public Option apply(@Nullable String s) { - Option o = new Option(s, "table", true, "HBase table to import data into"); - o.setRequired(true); - o.setArgName("HBASE_TABLE"); - return o; - } - }) - ,COLUMN_FAMILY("f", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "column_family", true, "Column family of the HBase table to import into"); - o.setRequired(true); - o.setArgName("CF_NAME"); - return o; - } - }) - ,EXTRACTOR_CONFIG("e", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "extractor_config", true, "JSON Document describing the extractor for this input data source"); - o.setArgName("JSON_FILE"); - o.setRequired(true); - return o; - } - }) - ,INPUT_DATA("i", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "input", true, "Input directory in HDFS for the data to import into HBase"); - o.setArgName("DIR"); - o.setRequired(true); - return o; - } - }) - ,AS_OF_TIME("a", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "as_of", true, "The last read timestamp to mark the records with (omit for time of execution)"); - o.setArgName("datetime"); - o.setRequired(false); - return o; - } - }) - ,AS_OF_TIME_FORMAT("z", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "as_of_format", true, "The format of the as_of time (only used in conjunction with the as_of option)"); - o.setArgName("format"); - o.setRequired(false); - return o; - } - }) - ,CONVERTER("c", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "converter", true, "The HBase converter class to use (Default is threat intel)"); - o.setArgName("class"); - o.setRequired(false); - return o; - } - }) - 
,ENRICHMENT_CONFIG("n", new OptionHandler() { - @Nullable - @Override - public Option apply(@Nullable String s) { - Option o = new Option(s, "enrichment_config", true - , "JSON Document describing the enrichment configuration details." + - " This is used to associate an enrichment type with a field type in zookeeper." - ); - o.setArgName("JSON_FILE"); - o.setRequired(false); - return o; - } - }) - ; - Option option; - String shortCode; - BulkLoadOptions(String shortCode, OptionHandler optionHandler) { - this.shortCode = shortCode; - this.option = optionHandler.apply(shortCode); - } - - public boolean has(CommandLine cli) { - return cli.hasOption(shortCode); - } - - public String get(CommandLine cli) { - return cli.getOptionValue(shortCode); - } - - public static CommandLine parse(CommandLineParser parser, String[] args) { - try { - CommandLine cli = parser.parse(getOptions(), args); - if(ThreatIntelBulkLoader.BulkLoadOptions.HELP.has(cli)) { - printHelp(); - System.exit(0); - } - return cli; - } catch (ParseException e) { - System.err.println("Unable to parse args: " + Joiner.on(' ').join(args)); - e.printStackTrace(System.err); - printHelp(); - System.exit(-1); - return null; - } - } - - public static void printHelp() { - HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp( "ThreatIntelBulkLoader", getOptions()); - } - - public static Options getOptions() { - Options ret = new Options(); - for(BulkLoadOptions o : BulkLoadOptions.values()) { - ret.addOption(o.option); - } - return ret; - } - } - - private static long getTimestamp(CommandLine cli) throws java.text.ParseException { - if(BulkLoadOptions.AS_OF_TIME.has(cli)) { - if(!BulkLoadOptions.AS_OF_TIME_FORMAT.has(cli)) { - throw new IllegalStateException("Unable to proceed: Specified as_of_time without an associated format."); - } - else { - DateFormat format = new SimpleDateFormat(BulkLoadOptions.AS_OF_TIME_FORMAT.get(cli)); - Date d = format.parse(BulkLoadOptions.AS_OF_TIME.get(cli)); - return 
d.getTime(); - } - } - else { - return System.currentTimeMillis(); - } - } - private static String readExtractorConfig(File configFile) throws IOException { - return Joiner.on("\n").join(Files.readLines(configFile, Charset.defaultCharset())); - } - - public static Job createJob(Configuration conf, String input, String table, String cf, String extractorConfigContents, long ts, HbaseConverter converter) throws IOException { - Job job = new Job(conf); - job.setJobName("ThreatIntelBulkLoader: " + input + " => " + table + ":" + cf); - System.out.println("Configuring " + job.getJobName()); - job.setJarByClass(ThreatIntelBulkLoader.class); - job.setMapperClass(org.apache.metron.dataloads.hbase.mr.BulkLoadMapper.class); - job.setOutputFormatClass(TableOutputFormat.class); - job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table); - job.getConfiguration().set(BulkLoadMapper.COLUMN_FAMILY_KEY, cf); - job.getConfiguration().set(BulkLoadMapper.CONFIG_KEY, extractorConfigContents); - job.getConfiguration().set(BulkLoadMapper.LAST_SEEN_KEY, "" + ts); - job.getConfiguration().set(BulkLoadMapper.CONVERTER_KEY, converter.getClass().getName()); - job.setOutputKeyClass(ImmutableBytesWritable.class); - job.setOutputValueClass(Put.class); - job.setNumReduceTasks(0); - ExtractorHandler handler = ExtractorHandler.load(extractorConfigContents); - handler.getInputFormatHandler().set(job, new Path(input), handler.getConfig()); - return job; - } - - public static void main(String... 
argv) throws Exception { - Configuration conf = HBaseConfiguration.create(); - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine cli = BulkLoadOptions.parse(new PosixParser(), otherArgs); - Long ts = getTimestamp(cli); - String input = BulkLoadOptions.INPUT_DATA.get(cli); - String table = BulkLoadOptions.TABLE.get(cli); - String cf = BulkLoadOptions.COLUMN_FAMILY.get(cli); - String extractorConfigContents = readExtractorConfig(new File(BulkLoadOptions.EXTRACTOR_CONFIG.get(cli))); - String converterClass = EnrichmentConverter.class.getName(); - if(BulkLoadOptions.CONVERTER.has(cli)) { - converterClass = BulkLoadOptions.CONVERTER.get(cli); - } - SensorEnrichmentUpdateConfig sensorEnrichmentUpdateConfig = null; - if(BulkLoadOptions.ENRICHMENT_CONFIG.has(cli)) { - sensorEnrichmentUpdateConfig = JSONUtils.INSTANCE.load( new File(BulkLoadOptions.ENRICHMENT_CONFIG.get(cli)) - , SensorEnrichmentUpdateConfig.class - ); - } - - HbaseConverter converter = (HbaseConverter) Class.forName(converterClass).getConstructor().newInstance(); - Job job = createJob(conf, input, table, cf, extractorConfigContents, ts, converter); - System.out.println(conf); - boolean jobRet = job.waitForCompletion(true); - if(!jobRet) { - System.exit(1); - } - if(sensorEnrichmentUpdateConfig != null) { - sensorEnrichmentUpdateConfig.updateSensorConfigs(); - } - System.exit(0); - } -} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java index 89477d81b2..2e2f799bb4 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java @@ -33,7 +33,7 @@ public class ExtractorHandler { final static 
ObjectMapper _mapper = new ObjectMapper(); private Map config; private Extractor extractor; - private InputFormatHandler inputFormatHandler = Formats.BY_LINE; + private InputFormatHandler inputFormat = Formats.BY_LINE; public Map getConfig() { return config; @@ -43,13 +43,13 @@ public void setConfig(Map config) { this.config = config; } - public InputFormatHandler getInputFormatHandler() { - return inputFormatHandler; + public InputFormatHandler getInputFormat() { + return inputFormat; } - public void setInputFormatHandler(String handler) { + public void setInputFormat(String handler) { try { - this.inputFormatHandler= Formats.create(handler); + this.inputFormat= Formats.create(handler); } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) { throw new IllegalStateException("Unable to create an inputformathandler", e); } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java index e44eb27175..168d251da3 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ExtractorState.java @@ -17,19 +17,29 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.metron.dataloads.extractor.Extractor; import org.apache.metron.enrichment.converter.HbaseConverter; +import java.io.IOException; + public class ExtractorState { private HTableInterface table; private Extractor extractor; private HbaseConverter converter; + private FileSystem fs; - public 
ExtractorState(HTableInterface table, Extractor extractor, HbaseConverter converter) { + public ExtractorState(HTableInterface table, Extractor extractor, HbaseConverter converter, Configuration config) { this.table = table; this.extractor = extractor; this.converter = converter; + try { + this.fs = FileSystem.get(config); + } catch (IOException e) { + throw new IllegalStateException("Unable to retrieve hadoop file system: " + e.getMessage(), e); + } } public HTableInterface getTable() { @@ -43,4 +53,8 @@ public Extractor getExtractor() { public HbaseConverter getConverter() { return converter; } + + public FileSystem getFileSystem() { + return fs; + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java index c7cf71c78a..612db7054b 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java @@ -5,6 +5,7 @@ import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; import org.apache.metron.common.utils.ConversionUtils; +import org.apache.metron.dataloads.nonbulk.flatfile.importer.ImportStrategy; import javax.annotation.Nullable; import java.io.File; @@ -54,7 +55,7 @@ public Option apply(@Nullable String s) { @Override public Optional getValue(LoadOptions option, CommandLine cli) { - return Optional.ofNullable(option.get(cli)); + return Optional.ofNullable(option.get(cli).trim()); } }) ,HBASE_CF("c", new OptionHandler() { @@ -69,7 +70,7 @@ public Option apply(@Nullable String s) { @Override public Optional getValue(LoadOptions option, CommandLine cli) { - return Optional.ofNullable(option.get(cli)); + return Optional.ofNullable(option.get(cli).trim()); } }) 
,EXTRACTOR_CONFIG("e", new OptionHandler() { @@ -85,7 +86,7 @@ public Option apply(@Nullable String s) { @Override public Optional getValue(LoadOptions option, CommandLine cli) { try { - return Optional.ofNullable(FileUtils.readFileToString(new File(option.get(cli)))); + return Optional.ofNullable(FileUtils.readFileToString(new File(option.get(cli).trim()))); } catch (IOException e) { throw new IllegalStateException("Unable to retrieve extractor config from " + option.get(cli) + ": " + e.getMessage(), e); } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index f74d4108d9..a48266e5d7 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -17,32 +17,19 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile; -import com.google.common.collect.ImmutableList; import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.client.HTableInterface; -import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.log4j.PropertyConfigurator; -import org.apache.metron.common.utils.ConversionUtils; -import org.apache.metron.common.utils.file.ReaderSpliterator; -import org.apache.metron.dataloads.extractor.Extractor; import org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; import 
org.apache.metron.common.configuration.enrichment.SensorEnrichmentUpdateConfig; -import org.apache.metron.hbase.HTableProvider; -import org.apache.metron.enrichment.converter.HbaseConverter; -import org.apache.metron.enrichment.converter.EnrichmentConverter; -import org.apache.metron.enrichment.lookup.LookupKV; +import org.apache.metron.dataloads.nonbulk.flatfile.importer.ImportStrategy; import org.apache.metron.common.utils.JSONUtils; import java.io.*; import java.util.*; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ForkJoinPool; -import java.util.stream.Stream; public class SimpleEnrichmentFlatFileLoader { @@ -50,14 +37,17 @@ public class SimpleEnrichmentFlatFileLoader { public static void main(String... argv) throws Exception { Configuration hadoopConfig = HBaseConfiguration.create(); String[] otherArgs = new GenericOptionsParser(hadoopConfig, argv).getRemainingArgs(); + main(hadoopConfig, otherArgs); + } + public static void main(Configuration hadoopConfig, String[] argv) throws Exception { - CommandLine cli = LoadOptions.parse(new PosixParser(), otherArgs); + CommandLine cli = LoadOptions.parse(new PosixParser(), argv); EnumMap> config = LoadOptions.createConfig(cli); if(LoadOptions.LOG4J_PROPERTIES.has(cli)) { PropertyConfigurator.configure(LoadOptions.LOG4J_PROPERTIES.get(cli)); } ExtractorHandler handler = ExtractorHandler.load( - FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli))) + FileUtils.readFileToString(new File(LoadOptions.EXTRACTOR_CONFIG.get(cli).trim())) ); ImportStrategy strategy = (ImportStrategy) config.get(LoadOptions.IMPORT_MODE).get(); strategy.getImporter().importData(config, handler, hadoopConfig); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java similarity index 76% rename 
from metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java rename to metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java index 148cfe8c55..2d7062a1a0 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/ImportStrategy.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java @@ -1,8 +1,5 @@ -package org.apache.metron.dataloads.nonbulk.flatfile; +package org.apache.metron.dataloads.nonbulk.flatfile.importer; -import com.google.common.base.Joiner; - -import java.util.List; import java.util.Optional; public enum ImportStrategy { @@ -24,7 +21,7 @@ public static Optional getStrategy(String strategyName) { return Optional.empty(); } for(ImportStrategy strategy : values()) { - if(strategy.name().equalsIgnoreCase(strategyName)) { + if(strategy.name().equalsIgnoreCase(strategyName.trim())) { return Optional.of(strategy); } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java similarity index 79% rename from metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java rename to metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java index 05810f44f3..aceb824aa7 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/Importer.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java @@ -1,8 +1,9 @@ -package org.apache.metron.dataloads.nonbulk.flatfile; +package 
org.apache.metron.dataloads.nonbulk.flatfile.importer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.metron.dataloads.extractor.ExtractorHandler; +import org.apache.metron.dataloads.nonbulk.flatfile.LoadOptions; import org.apache.metron.enrichment.converter.EnrichmentConverter; import java.io.IOException; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java similarity index 59% rename from metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java rename to metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java index e87d63bc93..da5aeb7b0b 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LocalImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java @@ -1,122 +1,31 @@ -package org.apache.metron.dataloads.nonbulk.flatfile; +package org.apache.metron.dataloads.nonbulk.flatfile.importer; -import com.google.common.collect.ImmutableList; -import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.Put; import org.apache.metron.common.utils.file.ReaderSpliterator; import org.apache.metron.dataloads.extractor.Extractor; import org.apache.metron.dataloads.extractor.ExtractorHandler; import org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat; +import 
org.apache.metron.dataloads.nonbulk.flatfile.ExtractorState; +import org.apache.metron.dataloads.nonbulk.flatfile.LoadOptions; +import org.apache.metron.dataloads.nonbulk.flatfile.location.Location; +import org.apache.metron.dataloads.nonbulk.flatfile.location.LocationStrategy; import org.apache.metron.enrichment.converter.EnrichmentConverter; import org.apache.metron.enrichment.converter.HbaseConverter; import org.apache.metron.enrichment.lookup.LookupKV; import org.apache.metron.hbase.HTableProvider; import java.io.*; -import java.nio.file.Files; import java.util.*; import java.util.concurrent.ForkJoinPool; import java.util.function.Consumer; -import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.sun.tools.javac.jvm.ByteCodes.ret; - public enum LocalImporter implements Importer { INSTANCE; - private static ThreadLocal fs = new ThreadLocal(){ - - @Override - protected FileSystem initialValue() { - try { - return FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new IllegalStateException("Unable to retrieve the filesystem: " + e.getMessage(), e); - } - } - }; - - /** - * Location can be either a local file or a file on HDFS. 
- */ - private static final class Location { - private String loc; - private boolean isLocal; - - public Location(String loc) { - this(loc, !loc.startsWith("hdfs://")); - } - public Location(String loc, boolean isLocal) { - this.loc = loc; - this.isLocal = isLocal; - } - - public Optional> getChildren() throws IOException { - if(exists() && isDirectory()) { - List children = new ArrayList<>(); - for(String child : list().orElse(new ArrayList<>())) { - children.add(new Location(child, isLocal)); - } - return Optional.of(children); - } - else { - return Optional.empty(); - } - } - - private Optional> list() throws IOException { - List children = new ArrayList<>(); - if(isLocal) { - for(File f : new File(loc).listFiles()) { - children.add(f.getPath()); - } - } - else { - for(FileStatus f : fs.get().listStatus(new Path(loc)) ) { - children.add(f.getPath().toString()); - } - } - return Optional.of(children); - } - - public boolean exists() throws IOException { - if(isLocal) { - return new File(loc).exists(); - } - else { - return fs.get().exists(new Path(loc)); - } - } - - public boolean isDirectory() throws IOException { - if(isLocal) { - return new File(loc).isDirectory(); - } - else { - return fs.get().isDirectory(new Path(loc)); - } - } - - public BufferedReader openReader() throws IOException { - if(isLocal) { - return new BufferedReader(new FileReader(new File(loc))); - } - else { - return new BufferedReader(new InputStreamReader(fs.get().open(new Path(loc)))); - } - } - - @Override - public String toString() { - return loc; - } - } public interface HTableProviderRetriever { HTableProvider retrieve(); @@ -141,14 +50,14 @@ public void importData( final EnumMap> config protected ExtractorState initialValue() { try { HTableInterface table = provider.retrieve().getTable(hadoopConfig, (String) config.get(LoadOptions.HBASE_TABLE).get()); - return new ExtractorState(table, handler.getExtractor(), new EnrichmentConverter()); + return new ExtractorState(table, 
handler.getExtractor(), new EnrichmentConverter(), hadoopConfig); } catch (IOException e1) { throw new IllegalStateException("Unable to get table: " + e1); } } }; - boolean lineByLine = !handler.getInputFormatHandler().getClass().equals(WholeFileFormat.class); + boolean lineByLine = !handler.getInputFormat().getClass().equals(WholeFileFormat.class); List inputs = (List) config.get(LoadOptions.INPUT).get(); String cf = (String) config.get(LoadOptions.HBASE_CF).get(); if(!lineByLine) { @@ -168,7 +77,7 @@ public void extractLineByLine( List inputs , int batchSize , int numThreads ) throws IOException { - inputs.stream().map(input -> new Location(input)) + inputs.stream().map(input -> LocationStrategy.getLocation(input, state.get().getFileSystem())) .forEach( loc -> { try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { @@ -193,7 +102,7 @@ public void extractLineByLine( List inputs public void extractWholeFiles( List inputs, ThreadLocal state, String cf) throws IOException { final List locations = new ArrayList<>(); - fileVisitor(inputs, loc -> locations.add(loc)); + Location.fileVisitor(inputs, loc -> locations.add(loc), state.get().getFileSystem()); locations.parallelStream().forEach(loc -> { try(BufferedReader br = loc.openReader()) { String s = br.lines().collect(Collectors.joining()); @@ -221,27 +130,6 @@ public List extract(String line } - public void fileVisitor(List inputs - , final Consumer importConsumer - ) throws IOException { - Stack stack = new Stack<>(); - for(String input : inputs) { - Location loc = new Location(input); - if(loc.exists()) { - stack.add(loc); - } - } - while(!stack.empty()) { - Location loc = stack.pop(); - if(loc.isDirectory()) { - for(Location child : loc.getChildren().orElse(Collections.emptyList())) { - stack.push(child); - } - } - else { - importConsumer.accept(loc); - } - } - } + } diff --git 
a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java similarity index 92% rename from metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java rename to metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java index b9bcc49a3c..d83a359897 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/MapReduceImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java @@ -1,4 +1,4 @@ -package org.apache.metron.dataloads.nonbulk.flatfile; +package org.apache.metron.dataloads.nonbulk.flatfile.importer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -8,6 +8,7 @@ import org.apache.hadoop.mapreduce.Job; import org.apache.metron.dataloads.extractor.ExtractorHandler; import org.apache.metron.dataloads.hbase.mr.BulkLoadMapper; +import org.apache.metron.dataloads.nonbulk.flatfile.LoadOptions; import org.apache.metron.enrichment.converter.EnrichmentConverter; import java.io.IOException; @@ -44,7 +45,7 @@ public void importData(EnumMap> config job.setOutputValueClass(Put.class); job.setNumReduceTasks(0); List paths = inputs.stream().map(p -> new Path(p)).collect(Collectors.toList()); - handler.getInputFormatHandler().set(job, paths, handler.getConfig()); + handler.getInputFormat().set(job, paths, handler.getConfig()); try { job.waitForCompletion(true); } catch (Exception e) { diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java new file mode 100644 index 0000000000..9713e7cd69 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java @@ -0,0 +1,40 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class FileLocation implements RawLocation { + @Override + public Optional> list(String loc) { + List children = new ArrayList<>(); + for(File f : new File(loc).listFiles()) { + children.add(f.getPath()); + } + return Optional.of(children); + } + + @Override + public boolean exists(String loc) throws IOException { + return new File(loc).exists(); + } + + @Override + public boolean isDirectory(String loc) throws IOException { + return new File(loc).isDirectory(); + } + + @Override + public BufferedReader openReader(String loc) throws IOException { + return new BufferedReader(new FileReader(loc)); + } + + @Override + public boolean match(String loc) { + return new File(loc).exists(); + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java new file mode 100644 index 0000000000..c58356077b --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java @@ -0,0 +1,57 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.BufferedReader; 
+import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class HDFSLocation implements RawLocation { + + FileSystem fs = null; + + @Override + public Optional> list(String loc) throws IOException { + List children = new ArrayList<>(); + for(FileStatus f : fs.listStatus(new Path(loc)) ) { + children.add(f.getPath().toString()); + } + return Optional.of(children); + } + + @Override + public boolean exists(String loc) throws IOException { + return fs.exists(new Path(loc)); + } + + @Override + public boolean isDirectory(String loc) throws IOException { + return fs.isDirectory(new Path(loc)); + } + + @Override + public BufferedReader openReader(String loc) throws IOException { + return new BufferedReader(new InputStreamReader(fs.open(new Path(loc)))); + } + + @Override + public boolean match(String loc) { + try { + return loc.startsWith("hdfs://") && exists(loc); + } catch (IOException e) { + return false; + } + } + + @Override + public void init(FileSystem state) { + this.fs = state; + } + + +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java new file mode 100644 index 0000000000..137e70224e --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java @@ -0,0 +1,78 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import org.apache.hadoop.fs.*; +import org.apache.metron.dataloads.nonbulk.flatfile.importer.LocalImporter; + +import java.io.*; +import java.util.*; +import java.util.function.Consumer; + +/** + * Location can be either a local file or a file on HDFS. 
+ */ +public class Location { + + private String loc; + private RawLocation rawLocation; + + public Location(String loc, RawLocation rawLocation) { + this.loc = loc; + this.rawLocation = rawLocation; + + } + + public Optional> getChildren() throws IOException { + if(exists() && isDirectory()) { + List children = new ArrayList<>(); + for(String child : rawLocation.list(loc).orElse(new ArrayList<>())) { + children.add(new Location(child, rawLocation)); + } + return Optional.of(children); + } + else { + return Optional.empty(); + } + } + + + public boolean exists() throws IOException { + return rawLocation.exists(loc); + } + + public boolean isDirectory() throws IOException { + return rawLocation.isDirectory(loc); + } + + public BufferedReader openReader() throws IOException { + return rawLocation.openReader(loc); + } + + @Override + public String toString() { + return loc; + } + + public static void fileVisitor(List inputs + , final Consumer importConsumer + , final FileSystem fs + ) throws IOException { + Stack stack = new Stack<>(); + for(String input : inputs) { + Location loc = LocationStrategy.getLocation(input, fs); + if(loc.exists()) { + stack.add(loc); + } + } + while(!stack.empty()) { + Location loc = stack.pop(); + if(loc.isDirectory()) { + for(Location child : loc.getChildren().orElse(Collections.emptyList())) { + stack.push(child); + } + } + else { + importConsumer.accept(loc); + } + } + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java new file mode 100644 index 0000000000..a3f3e78345 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java @@ -0,0 +1,50 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import 
org.apache.hadoop.fs.FileSystem; + +import java.util.Optional; +import java.util.function.Function; + +public enum LocationStrategy { + HDFS(fs -> { + HDFSLocation location = new HDFSLocation(); + location.init(fs); + return location; + }) + ,FILE(fs -> { + FileLocation location = new FileLocation(); + location.init(fs); + return location; + }) + ,URL(fs -> { + URLLocation location = new URLLocation(); + location.init(fs); + return location; + }) + ; + Function> locationCreator; + + LocationStrategy(Function> locationCreator) { + this.locationCreator = locationCreator; + } + + public static Optional> getRawLocation(String loc, FileSystem fs) { + for(LocationStrategy strategy : values()) { + RawLocation location = strategy.locationCreator.apply(fs); + if(location.match(loc)) { + return Optional.of(location); + } + } + return Optional.empty(); + } + + public static Location getLocation(String loc, FileSystem fs) { + Optional> rawLoc = getRawLocation(loc, fs); + if(rawLoc.isPresent()) { + return new Location(loc, rawLoc.get()); + } + else { + throw new IllegalStateException("Unsupported type: " + loc); + } + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java new file mode 100644 index 0000000000..af01361ef8 --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java @@ -0,0 +1,17 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.List; +import java.util.Optional; + +public interface RawLocation { + Optional> list(String loc) throws IOException; + boolean exists(String loc) throws IOException; + boolean isDirectory(String loc) throws IOException; + BufferedReader 
openReader(String loc) throws IOException; + boolean match(String loc); + default void init(T state) { + + } +} diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java new file mode 100644 index 0000000000..a5e83fa8bf --- /dev/null +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java @@ -0,0 +1,53 @@ +package org.apache.metron.dataloads.nonbulk.flatfile.location; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipInputStream; + +public class URLLocation implements RawLocation { + + @Override + public Optional> list(String loc) throws IOException { + return Optional.of(Collections.emptyList()); + } + + @Override + public boolean exists(String loc) throws IOException { + return true; + } + + @Override + public boolean isDirectory(String loc) throws IOException { + return false; + } + + @Override + public BufferedReader openReader(String loc) throws IOException { + InputStream is = new URL(loc).openStream(); + if(loc.endsWith(".zip")) { + is = new ZipInputStream(is); + } + else if(loc.endsWith(".gz")) { + is = new GZIPInputStream(is); + } + return new BufferedReader(new InputStreamReader(is)); + } + + @Override + public boolean match(String loc) { + try { + new URL(loc); + return true; + } catch (MalformedURLException e) { + return false; + } + } +} diff --git a/metron-platform/metron-data-management/src/main/scripts/threatintel_bulk_load.sh 
b/metron-platform/metron-data-management/src/main/scripts/threatintel_bulk_load.sh deleted file mode 100755 index 865d0ada35..0000000000 --- a/metron-platform/metron-data-management/src/main/scripts/threatintel_bulk_load.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -BIGTOP_DEFAULTS_DIR=${BIGTOP_DEFAULTS_DIR-/etc/default} -[ -n "${BIGTOP_DEFAULTS_DIR}" -a -r ${BIGTOP_DEFAULTS_DIR}/hbase ] && . ${BIGTOP_DEFAULTS_DIR}/hbase - -# Autodetect JAVA_HOME if not defined -if [ -e /usr/libexec/bigtop-detect-javahome ]; then - . /usr/libexec/bigtop-detect-javahome -elif [ -e /usr/lib/bigtop-utils/bigtop-detect-javahome ]; then - . 
/usr/lib/bigtop-utils/bigtop-detect-javahome -fi - -export HBASE_HOME=${HBASE_HOME:-/usr/hdp/current/hbase-client} -HADOOP_CLASSPATH=${HBASE_HOME}/lib/hbase-server.jar:`${HBASE_HOME}/bin/hbase classpath` -for jar in $(echo $HADOOP_CLASSPATH | sed 's/:/ /g');do - if [ -f $jar ];then - LIBJARS="$jar,$LIBJARS" - fi -done -export HADOOP_CLASSPATH -export METRON_VERSION=${project.version} -export METRON_HOME=/usr/metron/$METRON_VERSION -export DM_JAR=${project.artifactId}-$METRON_VERSION.jar -hadoop jar $METRON_HOME/lib/$DM_JAR org.apache.metron.dataloads.bulk.ThreatIntelBulkLoader -libjars ${LIBJARS} "$@" diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/BulkLoadMapperIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/BulkLoadMapperIntegrationTest.java deleted file mode 100644 index b7a753ba5d..0000000000 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/BulkLoadMapperIntegrationTest.java +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.metron.dataloads.hbase.mr; - -import com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory; -import org.adrianwalker.multilinestring.Multiline; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.PosixParser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.ResultScanner; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.metron.dataloads.bulk.ThreatIntelBulkLoader; -import org.apache.metron.enrichment.converter.EnrichmentConverter; -import org.apache.metron.enrichment.converter.EnrichmentKey; -import org.apache.metron.enrichment.converter.EnrichmentValue; -import org.apache.metron.enrichment.lookup.LookupKV; -import org.apache.metron.test.utils.UnitTestHelper; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.logging.Level; - -public class BulkLoadMapperIntegrationTest { - /** The test util. */ - private HBaseTestingUtility testUtil; - - /** The test table. 
*/ - private HTable testTable; - private String tableName = "malicious_domains"; - private String cf = "cf"; - private String csvFile="input.csv"; - private String extractorJson = "extractor.json"; - private String enrichmentJson = "enrichment_config.json"; - private String asOf = "04/15/2016"; - private String asOfFormat = "georgia"; - private String convertClass = "threadIntel.class"; - private Configuration config = null; - - - @Before - public void setup() throws Exception { - UnitTestHelper.setJavaLoggingLevel(Level.SEVERE); - Map.Entry kv = HBaseUtil.INSTANCE.create(true); - config = kv.getValue(); - testUtil = kv.getKey(); - testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); - } - - @After - public void teardown() throws Exception { - HBaseUtil.INSTANCE.teardown(testUtil); - } - /** - { - "config" : { - "columns" : { - "host" : 0 - ,"meta" : 2 - } - ,"indicator_column" : "host" - ,"separator" : "," - ,"type" : "threat" - } - ,"extractor" : "CSV" - } - */ - @Multiline - private static String extractorConfig; - - @Test - public void testCommandLine() throws Exception { - UnitTestHelper.setJavaLoggingLevel(GuiceComponentProviderFactory.class, Level.WARNING); - Configuration conf = HBaseConfiguration.create(); - - String[] argv = {"-f cf", "-t malicious_domains", "-e extractor.json", "-n enrichment_config.json", "-a 04/15/2016", "-i input.csv", "-z georgia", "-c threadIntel.class"}; - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine cli = ThreatIntelBulkLoader.BulkLoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson,ThreatIntelBulkLoader.BulkLoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, ThreatIntelBulkLoader.BulkLoadOptions.COLUMN_FAMILY.get(cli).trim()); - Assert.assertEquals(tableName,ThreatIntelBulkLoader.BulkLoadOptions.TABLE.get(cli).trim()); - 
Assert.assertEquals(enrichmentJson,ThreatIntelBulkLoader.BulkLoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - Assert.assertEquals(csvFile,ThreatIntelBulkLoader.BulkLoadOptions.INPUT_DATA.get(cli).trim()); - Assert.assertEquals(asOf, ThreatIntelBulkLoader.BulkLoadOptions.AS_OF_TIME.get(cli).trim()); - Assert.assertEquals(asOfFormat, ThreatIntelBulkLoader.BulkLoadOptions.AS_OF_TIME_FORMAT.get(cli).trim()); - Assert.assertEquals(convertClass, ThreatIntelBulkLoader.BulkLoadOptions.CONVERTER.get(cli).trim()); - } - - @Test - public void test() throws IOException, ClassNotFoundException, InterruptedException { - - Assert.assertNotNull(testTable); - FileSystem fs = FileSystem.get(config); - String contents = "google.com,1,foo"; - EnrichmentConverter converter = new EnrichmentConverter(); - HBaseUtil.INSTANCE.writeFile(contents, new Path("input.csv"), fs); - Job job = ThreatIntelBulkLoader.createJob(config, "input.csv", tableName, cf, extractorConfig, 0L, new EnrichmentConverter()); - Assert.assertTrue(job.waitForCompletion(true)); - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for(Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(1, results.size()); - Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); - Assert.assertEquals(results.get(0).getKey().type, "threat"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); - } -} diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java new file mode 100644 index 
0000000000..0f79220c82 --- /dev/null +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -0,0 +1,270 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.metron.dataloads.nonbulk.flatfile; + +import com.google.common.collect.ImmutableList; +import org.adrianwalker.multilinestring.Multiline; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.PosixParser; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.metron.dataloads.extractor.Extractor; +import org.apache.metron.dataloads.extractor.ExtractorHandler; +import org.apache.metron.dataloads.hbase.mr.HBaseUtil; +import org.apache.metron.enrichment.converter.EnrichmentConverter; +import org.apache.metron.enrichment.converter.EnrichmentKey; +import org.apache.metron.enrichment.converter.EnrichmentValue; +import org.apache.metron.enrichment.lookup.LookupKV; +import org.apache.metron.test.utils.UnitTestHelper; +import org.junit.*; + +import java.io.File; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.OpenOption; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.stream.Stream; + +public class SimpleEnrichmentFlatFileLoaderIntegrationTest { + + private static HBaseTestingUtility testUtil; + + /** The test table. 
*/ + private static HTable testTable; + private static Configuration config = null; + private static final String tableName = "enrichment"; + private static final String cf = "cf"; + private static final String csvFile="input.csv"; + private static final String extractorJson = "extractor.json"; + private static final String enrichmentJson = "enrichment_config.json"; + private static final String log4jProperty = "log4j"; + private static final File file1 = new File("target/sefflt_data_1.csv"); + private static final File file2 = new File("target/sefflt_data_2.csv"); + private static final File multilineFile= new File("target/sefflt_data_2.csv"); + private static final File lineByLineExtractorConfigFile = new File("target/sefflt_extractorConfig_lbl.json"); + private static final File wholeFileExtractorConfigFile = new File("target/sefflt_extractorConfig_wf.json"); + private static final int NUM_LINES = 1000; + + /** + { + "config" : { + "columns" : { + "host" : 0, + "meta" : 2 + }, + "indicator_column" : "host", + "separator" : ",", + "type" : "enrichment" + }, + "extractor" : "CSV" + } + */ + @Multiline + private static String lineByLineExtractorConfig; + + /** + { + "config" : { + "columns" : { + "host" : 0, + "meta" : 2 + }, + "indicator_column" : "host", + "separator" : ",", + "type" : "enrichment" + }, + "extractor" : "CSV", + "inputFormat" : "WHOLE_FILE" + } + */ + @Multiline + private static String wholeFileExtractorConfig; + + @BeforeClass + public static void setup() throws Exception { + UnitTestHelper.setJavaLoggingLevel(Level.SEVERE); + Map.Entry kv = HBaseUtil.INSTANCE.create(true); + config = kv.getValue(); + testUtil = kv.getKey(); + testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + + for(Result r : testTable.getScanner(Bytes.toBytes(cf))) { + Delete d = new Delete(r.getRow()); + testTable.delete(d); + } + + if(lineByLineExtractorConfigFile.exists()) { + lineByLineExtractorConfigFile.delete(); + } + Files.write( 
lineByLineExtractorConfigFile.toPath() + , lineByLineExtractorConfig.getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + ); + if(wholeFileExtractorConfigFile.exists()) { + wholeFileExtractorConfigFile.delete(); + } + Files.write( wholeFileExtractorConfigFile.toPath() + , wholeFileExtractorConfig.getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + ); + if(file1.exists()) { + file1.delete(); + } + Files.write( file1.toPath() + , "google1.com,1,foo2\n".getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + ); + if(file2.exists()) { + file2.delete(); + } + Files.write( file2.toPath() + , "google2.com,2,foo2\n".getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + ); + + if(multilineFile.exists()) { + multilineFile.delete(); + } + try(PrintWriter pw = new PrintWriter(multilineFile)) { + for(int i = 0;i < NUM_LINES;++i) { + pw.println("google" + i + ".com," + i + ",foo" + i); + } + } + + } + + @AfterClass + public static void teardown() throws Exception { + HBaseUtil.INSTANCE.teardown(testUtil); + file1.delete(); + file2.delete(); + multilineFile.delete(); + lineByLineExtractorConfigFile.delete(); + wholeFileExtractorConfigFile.delete(); + } + + + @Test + public void testArgs() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e extractor.json", "-n enrichment_config.json" + , "-l log4j", "-i input.csv" + , "-p 2", "-b 128" + }; + + String[] otherArgs = new GenericOptionsParser(config, argv).getRemainingArgs(); + + CommandLine cli = LoadOptions.parse(new PosixParser(), otherArgs); + Assert.assertEquals(extractorJson, LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); + Assert.assertEquals(cf, LoadOptions.HBASE_CF.get(cli).trim()); + Assert.assertEquals(tableName, LoadOptions.HBASE_TABLE.get(cli).trim()); + Assert.assertEquals(enrichmentJson, LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); + Assert.assertEquals(csvFile, 
LoadOptions.INPUT.get(cli).trim()); + Assert.assertEquals(log4jProperty, LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); + Assert.assertEquals("2", LoadOptions.NUM_THREADS.get(cli).trim()); + Assert.assertEquals("128", LoadOptions.BATCH_SIZE.get(cli).trim()); + } + + @Test + public void testLocalLineByLine() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e " + lineByLineExtractorConfigFile.getPath() + , "-i " + multilineFile.getPath() + , "-p 2", "-b 128" + }; + SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(NUM_LINES, results.size()); + Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith("google")); + + } + + @Test + public void testLocalWholeFile() throws Exception { + String[] argv = { "-c cf", "-t enrichment" + , "-e " + wholeFileExtractorConfigFile.getPath() + , "-i " + file1.getPath() + "," + file2.getPath() + , "-p 2", "-b 128" + }; + SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for(Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(2, results.size()); + 
Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith( "google")); + + } + + @Test + public void testMRLineByLine() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e " + lineByLineExtractorConfigFile.getPath() + , "-i " + multilineFile.getName() + , "-m MR" + , "-p 2", "-b 128" + }; + FileSystem fs = FileSystem.get(config); + HBaseUtil.INSTANCE.writeFile(new String(Files.readAllBytes(multilineFile.toPath())), new Path(multilineFile.getName()), fs); + SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(NUM_LINES, results.size()); + Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith("google")); + } + +} diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java deleted file mode 100644 index 
e1769a9ac4..0000000000 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.metron.dataloads.nonbulk.flatfile; - -import com.google.common.collect.ImmutableList; -import org.adrianwalker.multilinestring.Multiline; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.PosixParser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.ResultScanner; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.metron.dataloads.extractor.Extractor; -import org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.hbase.mr.HBaseUtil; -import org.apache.metron.enrichment.converter.EnrichmentConverter; -import org.apache.metron.enrichment.converter.EnrichmentKey; -import org.apache.metron.enrichment.converter.EnrichmentValue; -import org.apache.metron.enrichment.lookup.LookupKV; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.stream.Stream; - -public class SimpleEnrichmentFlatFileLoaderTest { - - private HBaseTestingUtility testUtil; - - /** The test table. 
*/ - private HTable testTable; - private String tableName = "enrichment"; - private String cf = "cf"; - private String csvFile="input.csv"; - private String extractorJson = "extractor.json"; - private String enrichmentJson = "enrichment_config.json"; - private String log4jProperty = "log4j"; - - Configuration config = null; - /** - { - "config" : { - "columns" : { - "host" : 0, - "meta" : 2 - }, - "indicator_column" : "host", - "separator" : ",", - "type" : "enrichment" - }, - "extractor" : "CSV" - } - */ - @Multiline - private static String extractorConfig; - - @Before - public void setup() throws Exception { - Map.Entry kv = HBaseUtil.INSTANCE.create(true); - config = kv.getValue(); - testUtil = kv.getKey(); - testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); - } - - @After - public void teardown() throws Exception { - HBaseUtil.INSTANCE.teardown(testUtil); - } - - @Test - public void testCommandLine() throws Exception { - Configuration conf = HBaseConfiguration.create(); - - String[] argv = { "-c cf", "-t enrichment" - , "-e extractor.json", "-n enrichment_config.json" - , "-l log4j", "-i input.csv" - , "-p 2", "-b 128" - }; - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine cli = LoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson, LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, LoadOptions.HBASE_CF.get(cli).trim()); - Assert.assertEquals(tableName, LoadOptions.HBASE_TABLE.get(cli).trim()); - Assert.assertEquals(enrichmentJson, LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - Assert.assertEquals(csvFile, LoadOptions.INPUT.get(cli).trim()); - Assert.assertEquals(log4jProperty, LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); - Assert.assertEquals("2", LoadOptions.NUM_THREADS.get(cli).trim()); - Assert.assertEquals("128", LoadOptions.BATCH_SIZE.get(cli).trim()); - } - - @Test - public void test() throws Exception { - - 
Assert.assertNotNull(testTable); - String contents = "google.com,1,foo"; - - EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(extractorConfig); - Extractor e = handler.getExtractor(); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - Stream contentStreams = ImmutableList.of(contents).stream(); - ThreadLocal state = new ThreadLocal() { - @Override - protected ExtractorState initialValue() { - return new ExtractorState(testTable, e, converter); - } - }; - loader.load(ImmutableList.of(contentStreams) - , state - , cf - , 2 - ); - - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for(Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(1, results.size()); - Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); - Assert.assertEquals(results.get(0).getKey().type, "enrichment"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); - } - -} From bf7756b52e66907ca23a576ba9be9ab40b33f77d Mon Sep 17 00:00:00 2001 From: cstella Date: Tue, 31 Jan 2017 17:22:17 -0500 Subject: [PATCH 15/32] Forgot licenses. 
--- .../nonbulk/flatfile/LoadOptions.java | 17 +++++++++++++++++ .../nonbulk/flatfile/OptionHandler.java | 17 +++++++++++++++++ .../flatfile/importer/ImportStrategy.java | 17 +++++++++++++++++ .../nonbulk/flatfile/importer/Importer.java | 18 ++++++++++++++++++ .../flatfile/importer/LocalImporter.java | 17 +++++++++++++++++ .../flatfile/importer/MapReduceImporter.java | 17 +++++++++++++++++ .../flatfile/location/FileLocation.java | 17 +++++++++++++++++ .../flatfile/location/HDFSLocation.java | 17 +++++++++++++++++ .../nonbulk/flatfile/location/Location.java | 17 +++++++++++++++++ .../flatfile/location/LocationStrategy.java | 17 +++++++++++++++++ .../nonbulk/flatfile/location/RawLocation.java | 17 +++++++++++++++++ .../nonbulk/flatfile/location/URLLocation.java | 17 +++++++++++++++++ 12 files changed, 205 insertions(+) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java index 612db7054b..e418da6adf 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.metron.dataloads.nonbulk.flatfile; import com.google.common.base.Joiner; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java index 9b6ef6d201..30620b10d9 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile; import com.google.common.base.Function; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java index 2d7062a1a0..df88640335 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/ImportStrategy.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.importer; import java.util.Optional; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java index aceb824aa7..81ede088e1 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/Importer.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.metron.dataloads.nonbulk.flatfile.importer; import org.apache.hadoop.conf.Configuration; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java index da5aeb7b0b..45755130c5 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.importer; import org.apache.hadoop.conf.Configuration; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java index d83a359897..6929a8c396 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.importer; import org.apache.hadoop.conf.Configuration; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java index 9713e7cd69..d7121cfee1 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import java.io.BufferedReader; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java index c58356077b..67140fb57f 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import org.apache.hadoop.fs.FileStatus; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java index 137e70224e..ab7ddb521e 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import org.apache.hadoop.fs.*; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java index a3f3e78345..338a1e2268 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/LocationStrategy.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import org.apache.hadoop.fs.FileSystem; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java index af01361ef8..744942ee33 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import java.io.BufferedReader; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java index a5e83fa8bf..6cbdc7793c 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.metron.dataloads.nonbulk.flatfile.location; import java.io.BufferedReader; From a104f464e6b882121c7ab44079a5570d282c8457 Mon Sep 17 00:00:00 2001 From: cstella Date: Tue, 31 Jan 2017 19:28:46 -0500 Subject: [PATCH 16/32] updating script. 
--- .../flatfile/importer/MapReduceImporter.java | 2 +- .../src/main/scripts/flatfile_loader.sh | 22 ++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java index 6929a8c396..6b96079b2e 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java @@ -47,7 +47,7 @@ public void importData(EnumMap> config String table = (String) config.get(LoadOptions.HBASE_TABLE).get(); String cf = (String) config.get(LoadOptions.HBASE_CF).get(); String extractorConfigContents = (String) config.get(LoadOptions.EXTRACTOR_CONFIG).get(); - Job job = new Job(hadoopConfig); + Job job = Job.getInstance(hadoopConfig); List inputs = (List) config.get(LoadOptions.INPUT).get(); job.setJobName("MapReduceImporter: " + inputs.stream().collect(Collectors.joining(",")) + " => " + table + ":" + cf); System.out.println("Configuring " + job.getJobName()); diff --git a/metron-platform/metron-data-management/src/main/scripts/flatfile_loader.sh b/metron-platform/metron-data-management/src/main/scripts/flatfile_loader.sh index bba7f8ef24..b9e2746a6a 100755 --- a/metron-platform/metron-data-management/src/main/scripts/flatfile_loader.sh +++ b/metron-platform/metron-data-management/src/main/scripts/flatfile_loader.sh @@ -27,9 +27,25 @@ elif [ -e /usr/lib/bigtop-utils/bigtop-detect-javahome ]; then . 
/usr/lib/bigtop-utils/bigtop-detect-javahome fi -export HBASE_HOME=${HBASE_HOME:-/usr/hdp/current/hbase-client} export METRON_VERSION=${project.version} export METRON_HOME=/usr/metron/$METRON_VERSION +export CLASSNAME="org.apache.metron.dataloads.nonbulk.flatfile.SimpleEnrichmentFlatFileLoader" export DM_JAR=${project.artifactId}-$METRON_VERSION.jar -CP=$METRON_HOME/lib/$DM_JAR:/usr/metron/${METRON_VERSION}/lib/taxii-1.1.0.1.jar:`${HBASE_HOME}/bin/hbase classpath` -java -cp $CP org.apache.metron.dataloads.nonbulk.flatfile.SimpleEnrichmentFlatFileLoader "$@" +export HBASE_HOME=${HBASE_HOME:-/usr/hdp/current/hbase-client} + +if [ $(which hadoop) ] +then + HADOOP_CLASSPATH=${HBASE_HOME}/lib/hbase-server.jar:`${HBASE_HOME}/bin/hbase classpath` + for jar in $(echo $HADOOP_CLASSPATH | sed 's/:/ /g');do + if [ -f $jar ];then + LIBJARS="$jar,$LIBJARS" + fi + done + export HADOOP_CLASSPATH + hadoop jar $METRON_HOME/lib/$DM_JAR $CLASSNAME -libjars ${LIBJARS} "$@" +else + echo "Warning: Metron cannot find the hadoop client on this node. This means that loading via Map Reduce will NOT function." + CP=$METRON_HOME/lib/$DM_JAR:/usr/metron/${METRON_VERSION}/lib/taxii-1.1.0.1.jar:`${HBASE_HOME}/bin/hbase classpath` + java -cp $CP $CLASSNAME "$@" +fi + From b5a9e5a9243576b27d59e959dfab3e99d34eb761 Mon Sep 17 00:00:00 2001 From: cstella Date: Tue, 31 Jan 2017 19:57:02 -0500 Subject: [PATCH 17/32] Added gzip and zip to regular files --- metron-analytics/metron-statistics/README.md | 2 +- .../metron-data-management/README.md | 21 +++---------------- .../flatfile/location/FileLocation.java | 17 ++++++++++----- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/metron-analytics/metron-statistics/README.md b/metron-analytics/metron-statistics/README.md index 7b26c3366e..00b04510d7 100644 --- a/metron-analytics/metron-statistics/README.md +++ b/metron-analytics/metron-statistics/README.md @@ -45,7 +45,7 @@ functions can be used from everywhere where Stellar is used. 
* Input: * number - The number to take the absolute value of * Returns: The absolute value of the number passed in. -* + #### `BIN` * Description: Computes the bin that the value is in given a set of bounds. * Input: diff --git a/metron-platform/metron-data-management/README.md b/metron-platform/metron-data-management/README.md index 26dd4721f2..f511472d62 100644 --- a/metron-platform/metron-data-management/README.md +++ b/metron-platform/metron-data-management/README.md @@ -206,32 +206,16 @@ The parameters for the utility are as follows: | -n | --enrichment_config | No | The JSON document describing the enrichments to configure. Unlike other loaders, this is run first if specified. | -### Bulk Load from HDFS - -The shell script `$METRON_HOME/bin/threatintel_bulk_load.sh` will kick off a MR job to load data staged in HDFS into an HBase table. Note: despite what -the naming may suggest, this utility works for enrichment as well as threat intel due to the underlying infrastructure being the same. - -The parameters for the utility are as follows: - -| Short Code | Long Code | Is Required? | Description | -|------------|---------------------|--------------|-------------------------------------------------------------------------------------------------------------------| -| -h | | No | Generate the help screen/set of options | -| -e | --extractor_config | Yes | JSON Document describing the extractor for this input data source | -| -t | --table | Yes | The HBase table to import into | -| -f | --column_family | Yes | The HBase table column family to import into | -| -i | --input | Yes | The input data location on HDFS | -| -n | --enrichment_config | No | The JSON document describing the enrichments to configure. Unlike other loaders, this is run first if specified. | -or threat intel. ### Flatfile Loader -The shell script `$METRON_HOME/bin/flatfile_loader.sh` will read data from local disk and load the enrichment or threat intel data into an HBase table. 
+The shell script `$METRON_HOME/bin/flatfile_loader.sh` will read data from local disk, HDFS or URLs and load the enrichment or threat intel data into an HBase table. Note: This utility works for enrichment as well as threat intel due to the underlying infrastructure being the same. One special thing to note here is that there is a special configuration parameter to the Extractor config that is only considered during this loader: -* inputFormatHandler : This specifies how to consider the data. The two implementations are `BY_LINE` and `org.apache.metron.dataloads.extractor.inputformat.WholeFileFormat`. +* inputFormat : This specifies how to consider the data. The two implementations are `BY_LINE` and `WHOLE_FILE`. The default is `BY_LINE`, which makes sense for a list of CSVs where each line indicates a unit of information which can be imported. @@ -244,6 +228,7 @@ The parameters for the utility are as follows: |------------|---------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---| | -h | | No | Generate the help screen/set of options | | | -e | --extractor_config | Yes | JSON Document describing the extractor for this input data source | | +| -m | --import_mode | No | The Import mode to use: LOCAL, MR. Default is MR | | | -t | --hbase_table | Yes | The HBase table to import into | | | -c | --hbase_cf | Yes | The HBase table column family to import into | | | -i | --input | Yes | The input data location on local disk. If this is a file, then that file will be loaded. If this is a directory, then the files will be loaded recursively under that directory. 
| | diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java index d7121cfee1..3572204a3f 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java @@ -17,13 +17,12 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.Optional; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipInputStream; public class FileLocation implements RawLocation { @Override @@ -47,7 +46,15 @@ public boolean isDirectory(String loc) throws IOException { @Override public BufferedReader openReader(String loc) throws IOException { - return new BufferedReader(new FileReader(loc)); + if(loc.endsWith(".gz")) { + return new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(loc)))); + } + else if(loc.endsWith(".zip")){ + return new BufferedReader(new InputStreamReader(new ZipInputStream(new FileInputStream(loc)))); + } + else { + return new BufferedReader(new FileReader(loc)); + } } @Override From 323267ddfb52ab1aa7488e02643a8158044797e2 Mon Sep 17 00:00:00 2001 From: cstella Date: Wed, 1 Feb 2017 10:04:53 -0500 Subject: [PATCH 18/32] Fixed stupid zip issue. 
--- .../flatfile/importer/LocalImporter.java | 1 + .../flatfile/location/FileLocation.java | 13 ++------- .../flatfile/location/HDFSLocation.java | 5 ++-- .../nonbulk/flatfile/location/Location.java | 4 +++ .../flatfile/location/RawLocation.java | 29 +++++++++++++++++-- .../flatfile/location/URLLocation.java | 19 ++++-------- ...richmentFlatFileLoaderIntegrationTest.java | 2 ++ 7 files changed, 45 insertions(+), 28 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java index 45755130c5..1207700548 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java @@ -96,6 +96,7 @@ public void extractLineByLine( List inputs ) throws IOException { inputs.stream().map(input -> LocationStrategy.getLocation(input, state.get().getFileSystem())) .forEach( loc -> { + System.out.println("Processing " + loc.toString() + " using " + loc.getRawLocation().getClass()); try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java index 3572204a3f..267a6fb524 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/FileLocation.java 
@@ -22,6 +22,7 @@ import java.util.List; import java.util.Optional; import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; public class FileLocation implements RawLocation { @@ -45,16 +46,8 @@ public boolean isDirectory(String loc) throws IOException { } @Override - public BufferedReader openReader(String loc) throws IOException { - if(loc.endsWith(".gz")) { - return new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(loc)))); - } - else if(loc.endsWith(".zip")){ - return new BufferedReader(new InputStreamReader(new ZipInputStream(new FileInputStream(loc)))); - } - else { - return new BufferedReader(new FileReader(loc)); - } + public InputStream openInputStream(String loc) throws IOException { + return new FileInputStream(loc); } @Override diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java index 67140fb57f..bae6a828f9 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/HDFSLocation.java @@ -23,6 +23,7 @@ import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; @@ -52,8 +53,8 @@ public boolean isDirectory(String loc) throws IOException { } @Override - public BufferedReader openReader(String loc) throws IOException { - return new BufferedReader(new InputStreamReader(fs.open(new Path(loc)))); + public InputStream openInputStream(String loc) throws IOException { + return fs.open(new Path(loc)); } @Override diff --git 
a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java index ab7ddb521e..81eada62a7 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/Location.java @@ -38,6 +38,10 @@ public Location(String loc, RawLocation rawLocation) { } + public RawLocation getRawLocation() { + return rawLocation; + } + public Optional> getChildren() throws IOException { if(exists() && isDirectory()) { List children = new ArrayList<>(); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java index 744942ee33..5f2db33eb6 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/RawLocation.java @@ -17,18 +17,41 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; -import java.io.BufferedReader; -import java.io.IOException; +import java.io.*; import java.util.List; import java.util.Optional; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; public interface RawLocation { Optional> list(String loc) throws IOException; boolean exists(String loc) throws IOException; boolean isDirectory(String loc) throws IOException; - BufferedReader openReader(String loc) throws IOException; + + InputStream openInputStream(String loc) throws IOException; boolean 
match(String loc); default void init(T state) { } + + default BufferedReader openReader(String loc) throws IOException { + InputStream is = openInputStream(loc); + if(loc.endsWith(".gz")) { + return new BufferedReader(new InputStreamReader(new GZIPInputStream(is))); + } + else if(loc.endsWith(".zip")) { + ZipInputStream zis = new ZipInputStream(is); + ZipEntry entry = zis.getNextEntry(); + if(entry != null) { + return new BufferedReader(new InputStreamReader(zis)); + } + else { + return new BufferedReader(new InputStreamReader(new ByteArrayInputStream(new byte[] {}))); + } + } + else { + return new BufferedReader(new InputStreamReader(is)); + } + } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java index 6cbdc7793c..cc8edbeff5 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/location/URLLocation.java @@ -17,16 +17,16 @@ */ package org.apache.metron.dataloads.nonbulk.flatfile.location; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; +import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; public class URLLocation implements RawLocation { @@ -47,15 +47,8 @@ public boolean isDirectory(String loc) throws IOException { } @Override - public BufferedReader openReader(String loc) throws IOException { - InputStream is = new 
URL(loc).openStream(); - if(loc.endsWith(".zip")) { - is = new ZipInputStream(is); - } - else if(loc.endsWith(".gz")) { - is = new GZIPInputStream(is); - } - return new BufferedReader(new InputStreamReader(is)); + public InputStream openInputStream(String loc) throws IOException { + return new URL(loc).openConnection().getInputStream(); } @Override diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index 0f79220c82..8d61f6a2b6 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -69,6 +69,8 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { private static final File file1 = new File("target/sefflt_data_1.csv"); private static final File file2 = new File("target/sefflt_data_2.csv"); private static final File multilineFile= new File("target/sefflt_data_2.csv"); + private static final File multilineZipFile= new File("target/sefflt_data_2.csv.zip"); + private static final File multilineGzFile= new File("target/sefflt_data_2.csv.gz"); private static final File lineByLineExtractorConfigFile = new File("target/sefflt_extractorConfig_lbl.json"); private static final File wholeFileExtractorConfigFile = new File("target/sefflt_extractorConfig_wf.json"); private static final int NUM_LINES = 1000; From bc26b5b3992b91097bb4fc4b214d4b6bacaddfbb Mon Sep 17 00:00:00 2001 From: cstella Date: Wed, 1 Feb 2017 11:27:58 -0500 Subject: [PATCH 19/32] Updating readme and making progress bar optional and better. 
--- .../metron-data-management/README.md | 3 +- .../nonbulk/flatfile/LoadOptions.java | 13 ++++ .../flatfile/importer/LocalImporter.java | 60 +++++++++++++------ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/metron-platform/metron-data-management/README.md b/metron-platform/metron-data-management/README.md index f511472d62..eaafda4e74 100644 --- a/metron-platform/metron-data-management/README.md +++ b/metron-platform/metron-data-management/README.md @@ -227,8 +227,9 @@ The parameters for the utility are as follows: | Short Code | Long Code | Is Required? | Description | | |------------|---------------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---| | -h | | No | Generate the help screen/set of options | | +| -q | --quiet | No | Do not update progress | -e | --extractor_config | Yes | JSON Document describing the extractor for this input data source | | -| -m | --import_mode | No | The Import mode to use: LOCAL, MR. Default is MR | | +| -m | --import_mode | No | The Import mode to use: LOCAL, MR. Default: LOCAL | | | -t | --hbase_table | Yes | The HBase table to import into | | | -c | --hbase_cf | Yes | The HBase table column family to import into | | | -i | --input | Yes | The input data location on local disk. If this is a file, then that file will be loaded. If this is a directory, then the files will be loaded recursively under that directory. 
| | diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java index e418da6adf..76a9b2f2af 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java @@ -41,6 +41,19 @@ public Option apply(@Nullable String s) { return new Option(s, "help", false, "Generate Help screen"); } }) + ,QUIET("q", new OptionHandler() { + + @Nullable + @Override + public Option apply(@Nullable String s) { + return new Option(s, "quiet", false, "Do not update progress"); + } + + @Override + public Optional getValue(LoadOptions option, CommandLine cli) { + return Optional.of(option.has(cli)); + } + }) , IMPORT_MODE("m", new OptionHandler() { @Nullable @Override diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java index 1207700548..2bae225b27 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java @@ -37,6 +37,7 @@ import java.io.*; import java.util.*; import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -73,17 +74,17 @@ protected ExtractorState initialValue() { } } }; - + boolean quiet = (boolean) config.get(LoadOptions.QUIET).get(); boolean lineByLine = 
!handler.getInputFormat().getClass().equals(WholeFileFormat.class); List inputs = (List) config.get(LoadOptions.INPUT).get(); String cf = (String) config.get(LoadOptions.HBASE_CF).get(); if(!lineByLine) { - extractWholeFiles(inputs, state, cf); + extractWholeFiles(inputs, state, cf, quiet); } else { int batchSize = (int) config.get(LoadOptions.BATCH_SIZE).get(); int numThreads = (int) config.get(LoadOptions.NUM_THREADS).get(); - extractLineByLine(inputs, state, cf, batchSize, numThreads); + extractLineByLine(inputs, state, cf, batchSize, numThreads, quiet); } } @@ -93,38 +94,46 @@ public void extractLineByLine( List inputs , String cf , int batchSize , int numThreads + , boolean quiet ) throws IOException { inputs.stream().map(input -> LocationStrategy.getLocation(input, state.get().getFileSystem())) .forEach( loc -> { - System.out.println("Processing " + loc.toString() + " using " + loc.getRawLocation().getClass()); - try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { - - ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); - forkJoinPool.submit(() -> - stream.parallel().forEach(input -> { - ExtractorState es = state.get(); - try { - es.getTable().put(extract(input, es.getExtractor(), cf, es.getConverter())); - } catch (IOException e) { - throw new IllegalStateException("Unable to continue: " + e.getMessage(), e); - } - } + final Progress progress = new Progress(); + System.out.println("Processing " + loc.toString()); + try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + forkJoinPool.submit(() -> + stream.parallel().forEach(input -> { + ExtractorState es = state.get(); + try { + es.getTable().put(extract(input, es.getExtractor(), cf, es.getConverter(), progress, quiet)); + } catch (IOException e) { + throw new IllegalStateException("Unable to continue: " + e.getMessage(), e); + } + } ) ).get(); } catch (Exception e) { throw new 
IllegalStateException(e.getMessage(), e); } - } + } ); } - public void extractWholeFiles( List inputs, ThreadLocal state, String cf) throws IOException { + public void extractWholeFiles( List inputs, ThreadLocal state, String cf, boolean quiet) throws IOException { + final Progress progress = new Progress(); final List locations = new ArrayList<>(); Location.fileVisitor(inputs, loc -> locations.add(loc), state.get().getFileSystem()); locations.parallelStream().forEach(loc -> { try(BufferedReader br = loc.openReader()) { String s = br.lines().collect(Collectors.joining()); - state.get().getTable().put(extract(s, state.get().getExtractor(), cf, state.get().getConverter())); + state.get().getTable().put(extract( s + , state.get().getExtractor() + , cf, state.get().getConverter() + , progress + , quiet + ) + ); } catch (IOException e) { throw new IllegalStateException("Unable to read " + loc + ": " + e.getMessage(), e); } @@ -136,6 +145,8 @@ public List extract(String line , Extractor extractor , String cf , HbaseConverter converter + , final Progress progress + , final boolean quiet ) throws IOException { List ret = new ArrayList<>(); @@ -144,10 +155,21 @@ public List extract(String line Put put = converter.toPut(cf, kv.getKey(), kv.getValue()); ret.add(put); } + if(!quiet) { + progress.update(); + } return ret; } + public static class Progress { + private int count = 0; + private String anim= "|/-\\"; + public synchronized void update() { + int currentCount = count++; + System.out.print("\rProcessed " + currentCount + " - " + anim.charAt(currentCount % anim.length())); + } + } } From 6cdf35d94f72be7da524fd5f854876f131ddb9f9 Mon Sep 17 00:00:00 2001 From: cstella Date: Wed, 1 Feb 2017 12:39:59 -0500 Subject: [PATCH 20/32] updating tests to include gzip and zip --- .../flatfile/importer/LocalImporter.java | 4 +- ...richmentFlatFileLoaderIntegrationTest.java | 89 +++++++++++++++++-- 2 files changed, 86 insertions(+), 7 deletions(-) diff --git 
a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java index 2bae225b27..652a4c3284 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/LocalImporter.java @@ -99,7 +99,9 @@ public void extractLineByLine( List inputs inputs.stream().map(input -> LocationStrategy.getLocation(input, state.get().getFileSystem())) .forEach( loc -> { final Progress progress = new Progress(); - System.out.println("Processing " + loc.toString()); + if(!quiet) { + System.out.println("\nProcessing " + loc.toString()); + } try (Stream stream = ReaderSpliterator.lineStream(loc.openReader(), batchSize)) { ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); forkJoinPool.submit(() -> diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index 8d61f6a2b6..d0d637d4b6 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -43,6 +43,7 @@ import org.junit.*; import java.io.File; +import java.io.FileOutputStream; import java.io.PrintWriter; import java.nio.file.Files; import java.nio.file.OpenOption; @@ -52,6 +53,10 @@ import java.util.Map; import java.util.logging.Level; import 
java.util.stream.Stream; +import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipOutputStream; public class SimpleEnrichmentFlatFileLoaderIntegrationTest { @@ -155,9 +160,31 @@ public static void setup() throws Exception { if(multilineFile.exists()) { multilineFile.delete(); } - try(PrintWriter pw = new PrintWriter(multilineFile)) { + if(multilineGzFile.exists()) { + multilineGzFile.delete(); + } + if(multilineGzFile.exists()) { + multilineZipFile.delete(); + } + PrintWriter[] pws =new PrintWriter[] {}; + try { + ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(multilineZipFile)); + ZipEntry entry = new ZipEntry("file"); + zos.putNextEntry(entry); + pws = new PrintWriter[]{ + new PrintWriter(multilineFile), + new PrintWriter(zos), + new PrintWriter(new GZIPOutputStream(new FileOutputStream(multilineGzFile))) + }; for(int i = 0;i < NUM_LINES;++i) { - pw.println("google" + i + ".com," + i + ",foo" + i); + for(PrintWriter pw : pws) { + pw.println("google" + i + ".com," + i + ",foo" + i); + } + } + } + finally { + for(PrintWriter pw : pws) { + pw.close(); } } @@ -169,6 +196,8 @@ public static void teardown() throws Exception { file1.delete(); file2.delete(); multilineFile.delete(); + multilineGzFile.delete(); + multilineZipFile.delete(); lineByLineExtractorConfigFile.delete(); wholeFileExtractorConfigFile.delete(); } @@ -179,7 +208,7 @@ public void testArgs() throws Exception { String[] argv = {"-c cf", "-t enrichment" , "-e extractor.json", "-n enrichment_config.json" , "-l log4j", "-i input.csv" - , "-p 2", "-b 128" + , "-p 2", "-b 128", "-q" }; String[] otherArgs = new GenericOptionsParser(config, argv).getRemainingArgs(); @@ -200,7 +229,55 @@ public void testLocalLineByLine() throws Exception { String[] argv = {"-c cf", "-t enrichment" , "-e " + lineByLineExtractorConfigFile.getPath() , "-i " + multilineFile.getPath() - , "-p 2", "-b 128" + , "-p 2", "-b 128", "-q" + }; + 
SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(NUM_LINES, results.size()); + Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith("google")); + + } + + @Test + public void testLocalLineByLine_gz() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e " + lineByLineExtractorConfigFile.getPath() + , "-i " + multilineGzFile.getPath() + , "-p 2", "-b 128", "-q" + }; + SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(NUM_LINES, results.size()); + Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); + Assert.assertEquals(results.get(0).getKey().type, "enrichment"); + Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); + Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith("google")); + + } + + @Test + public void testLocalLineByLine_zip() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e " 
+ lineByLineExtractorConfigFile.getPath() + , "-i " + multilineZipFile.getPath() + , "-p 2", "-b 128", "-q" }; SimpleEnrichmentFlatFileLoader.main(config, argv); EnrichmentConverter converter = new EnrichmentConverter(); @@ -224,7 +301,7 @@ public void testLocalWholeFile() throws Exception { String[] argv = { "-c cf", "-t enrichment" , "-e " + wholeFileExtractorConfigFile.getPath() , "-i " + file1.getPath() + "," + file2.getPath() - , "-p 2", "-b 128" + , "-p 2", "-b 128", "-q" }; SimpleEnrichmentFlatFileLoader.main(config, argv); EnrichmentConverter converter = new EnrichmentConverter(); @@ -249,7 +326,7 @@ public void testMRLineByLine() throws Exception { , "-e " + lineByLineExtractorConfigFile.getPath() , "-i " + multilineFile.getName() , "-m MR" - , "-p 2", "-b 128" + , "-p 2", "-b 128", "-q" }; FileSystem fs = FileSystem.get(config); HBaseUtil.INSTANCE.writeFile(new String(Files.readAllBytes(multilineFile.toPath())), new Path(multilineFile.getName()), fs); From fd718bffa5e97f2c5c510b38d6a6d3812aefbed9 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Wed, 1 Feb 2017 11:57:04 -0700 Subject: [PATCH 21/32] Refactor --- .../metron/common/utils/ConversionUtils.java | 8 ++ .../common/utils/ConversionUtilsTest.java | 20 ++++ .../TransformFilterExtractorDecorator.java | 101 ++++++++++-------- 3 files changed, 87 insertions(+), 42 deletions(-) diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java index 97b36ee265..a83a144b22 100644 --- a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java @@ -35,6 +35,14 @@ protected ConvertUtilsBean initialValue() { } }; + public static T convertOrFail(Object o, Class clazz) { + if (clazz.isInstance(o)) { + return convert(o, clazz); + } 
else { + throw new IllegalArgumentException(String.format("Object is not of type %s", clazz.getCanonicalName())); + } + } + public static T convert(Object o, Class clazz) { if (o == null) { return null; diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java index 90d00e4c75..06c767ab9e 100644 --- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java @@ -19,9 +19,12 @@ package org.apache.metron.common.utils; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; public class ConversionUtilsTest { + @Test public void testIntegerConversions() { Object o = 1; @@ -29,4 +32,21 @@ public void testIntegerConversions() { Assert.assertEquals(Integer.valueOf(1), ConversionUtils.convert("1", Integer.class)); Assert.assertNull(ConversionUtils.convert("foo", Integer.class)); } + + @Test + public void same_object_type_hierarchy_will_pass_convertOrFail() { + Assert.assertEquals(new Integer(5), ConversionUtils.convertOrFail(new Integer(5), Integer.class)); + Assert.assertEquals(new Integer(5), ConversionUtils.convertOrFail(new Integer(5), Number.class)); + } + + @Rule + public final ExpectedException exception = ExpectedException.none(); + + @Test + public void different_object_types_will_fail_convertOrFail() { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Object is not of type java.lang.String"); + ConversionUtils.convertOrFail(new Integer(5), String.class); + } + } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java 
b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index 2b33ad27e2..faea4ca547 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -10,6 +10,7 @@ import org.apache.metron.common.dsl.StellarFunctions; import org.apache.metron.common.stellar.StellarPredicateProcessor; import org.apache.metron.common.stellar.StellarProcessor; +import org.apache.metron.common.utils.ConversionUtils; import org.apache.metron.common.utils.JSONUtils; import org.apache.metron.enrichment.lookup.LookupKV; @@ -24,6 +25,7 @@ public class TransformFilterExtractorDecorator extends ExtractorDecorator { private static final String INDICATOR_TRANSFORM = "indicator_transform"; private static final String INDICATOR_FILTER = "indicator_filter"; private static final String ZK_QUORUM = "zk_quorum"; + private static final String INDICATOR = "indicator"; private Map valueTransforms; private Map indicatorTransforms; private String valueFilter; @@ -35,30 +37,30 @@ public class TransformFilterExtractorDecorator extends ExtractorDecorator { public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { super(decoratedExtractor); + this.valueTransforms = new HashMap<>(); + this.indicatorTransforms = new HashMap<>(); + this.valueFilter = ""; + this.indicatorFilter = ""; } @Override public void initialize(Map config) { super.initialize(config); if (config.containsKey(VALUE_TRANSFORM)) { - this.valueTransforms = getTransforms(config.get(VALUE_TRANSFORM)); - } else { - this.valueTransforms = new HashMap<>(); + this.valueTransforms = getTransforms(config, VALUE_TRANSFORM); } if (config.containsKey(INDICATOR_TRANSFORM)) { - this.indicatorTransforms = getTransforms(config.get(INDICATOR_TRANSFORM)); - } 
else { - this.indicatorTransforms = new HashMap<>(); + this.indicatorTransforms = getTransforms(config, INDICATOR_TRANSFORM); } if (config.containsKey(VALUE_FILTER)) { - this.valueFilter = config.get(VALUE_FILTER).toString(); + this.valueFilter = getFilter(config, VALUE_FILTER); } if (config.containsKey(INDICATOR_FILTER)) { - this.indicatorFilter = config.get(INDICATOR_FILTER).toString(); + this.indicatorFilter = getFilter(config, INDICATOR_FILTER); } String zkClientUrl = ""; if (config.containsKey(ZK_QUORUM)) { - zkClientUrl = config.get(ZK_QUORUM).toString(); + zkClientUrl = ConversionUtils.convert(config.get(ZK_QUORUM), String.class); } Optional zkClient = createClient(zkClientUrl); this.globalConfig = getGlobalConfig(zkClient); @@ -68,13 +70,23 @@ public void initialize(Map config) { this.filterProcessor = new StellarPredicateProcessor(); } - private Map getTransforms(Object transformsConfig) { + private String getFilter(Map config, String valueFilter) { + return ConversionUtils.convertOrFail(config.get(valueFilter), String.class); + } + + /** + * Get a map of the transformations from the config of the specified type + * @param config main config map + * @param type the transformation type to get from config + * @return map of transformations. 
+ */ + private Map getTransforms(Map config, String type) { + Map transformsConfig = ConversionUtils.convertOrFail(config.get(type), Map.class); Map transforms = new HashMap<>(); - if (transformsConfig instanceof Map) { - Map map = (Map) transformsConfig; - for (Map.Entry e : map.entrySet()) { - transforms.put(e.getKey().toString(), e.getValue().toString()); - } + for (Map.Entry e : transformsConfig.entrySet()) { + String key = ConversionUtils.convertOrFail(e.getKey(), String.class); + String val = ConversionUtils.convertOrFail(e.getValue(), String.class); + transforms.put(key, val); } return transforms; } @@ -129,41 +141,46 @@ public Iterable extract(String line) throws IOException { return lkvs; } + /** + * Returns true if lookupkv is not null after transforms and filtering on the value and indicator key + * @param lkv LookupKV to transform and filter + * @return true if lkv is not null after transform/filter + */ private boolean updateLookupKV(LookupKV lkv) { Map ret = lkv.getValue().getMetadata(); - MapVariableResolver metadataResolver = new MapVariableResolver(ret, globalConfig); - for (Map.Entry entry : valueTransforms.entrySet()) { - Object o = transformProcessor.parse(entry.getValue(), metadataResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); - if (o == null) { - ret.remove(entry.getKey()); - } else { - ret.put(entry.getKey(), o); - } - } - // update key - // transform + Map ind = new HashMap<>(); String indicator = lkv.getKey().getIndicator(); // add indicator as a resolvable variable. 
Also enable using resolved/transformed variables and values from operating on the value metadata - Map ind = new HashMap<>(); - ind.putAll(ret); - ind.put("indicator", indicator); - MapVariableResolver indicatorResolver = new MapVariableResolver(ind, globalConfig); - for (Map.Entry entry : indicatorTransforms.entrySet()) { - Object o = transformProcessor.parse(entry.getValue(), indicatorResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); + ind.put(INDICATOR, indicator); + MapVariableResolver resolver = new MapVariableResolver(ret, ind, globalConfig); + transform(valueTransforms, ret, resolver); + transform(indicatorTransforms, ind, resolver); + // update indicator + Object updatedIndicator = ind.get(INDICATOR); + if (updatedIndicator != null) { + if (!(updatedIndicator instanceof String)) { + throw new UnsupportedOperationException("Indicator transform must return String type"); + } + lkv.getKey().setIndicator((String) updatedIndicator); + return filter(indicatorFilter, resolver) && filter(valueFilter, resolver); + } else { + return false; + } + } + + private void transform(Map transforms, Map variableMap, MapVariableResolver variableResolver) { + for (Map.Entry entry : transforms.entrySet()) { + Object o = transformProcessor.parse(entry.getValue(), variableResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); if (o == null) { - ind.remove(entry.getKey()); + variableMap.remove(entry.getKey()); } else { - ind.put(entry.getKey(), o); + variableMap.put(entry.getKey(), o); } } - // update indicator - if (ind.get("indicator") != null) { - lkv.getKey().setIndicator(ind.get("indicator").toString()); - } - // filter on indicator not being empty and both filters passing muster - return (ind.get("indicator") != null) - && filterProcessor.parse(indicatorFilter, metadataResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext) - && filterProcessor.parse(valueFilter, metadataResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); + } + + 
private Boolean filter(String filterPredicate, MapVariableResolver variableResolver) { + return filterProcessor.parse(filterPredicate, variableResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); } } From d24f0c974d27e3861cb431c48efb3380a372e58b Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Thu, 2 Feb 2017 12:03:56 -0700 Subject: [PATCH 22/32] Get unit test for extractor decorator working --- .../TransformFilterExtractorDecorator.java | 42 +++++---- .../extractor/ExtractorDecoratorTest.java | 42 +++++++++ ...TransformFilterExtractorDecoratorTest.java | 92 +++++++++++++++++++ .../SimpleEnrichmentFlatFileLoaderTest.java | 15 ++- 4 files changed, 172 insertions(+), 19 deletions(-) create mode 100644 metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java create mode 100644 metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index faea4ca547..fc05ee126f 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -26,6 +26,7 @@ public class TransformFilterExtractorDecorator extends ExtractorDecorator { private static final String INDICATOR_FILTER = "indicator_filter"; private static final String ZK_QUORUM = "zk_quorum"; private static final String INDICATOR = "indicator"; + private Optional zkClient; private Map valueTransforms; private Map indicatorTransforms; private String valueFilter; @@ -37,8 +38,9 @@ public class 
TransformFilterExtractorDecorator extends ExtractorDecorator { public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { super(decoratedExtractor); - this.valueTransforms = new HashMap<>(); - this.indicatorTransforms = new HashMap<>(); + this.zkClient = Optional.empty(); + this.valueTransforms = new LinkedHashMap<>(); + this.indicatorTransforms = new LinkedHashMap<>(); this.valueFilter = ""; this.indicatorFilter = ""; } @@ -62,7 +64,7 @@ public void initialize(Map config) { if (config.containsKey(ZK_QUORUM)) { zkClientUrl = ConversionUtils.convert(config.get(ZK_QUORUM), String.class); } - Optional zkClient = createClient(zkClientUrl); + zkClient = setupClient(zkClient, zkClientUrl); this.globalConfig = getGlobalConfig(zkClient); this.stellarContext = createContext(zkClient); StellarFunctions.initialize(stellarContext); @@ -82,7 +84,7 @@ private String getFilter(Map config, String valueFilter) { */ private Map getTransforms(Map config, String type) { Map transformsConfig = ConversionUtils.convertOrFail(config.get(type), Map.class); - Map transforms = new HashMap<>(); + Map transforms = new LinkedHashMap<>(); for (Map.Entry e : transformsConfig.entrySet()) { String key = ConversionUtils.convertOrFail(e.getKey(), String.class); String val = ConversionUtils.convertOrFail(e.getValue(), String.class); @@ -92,18 +94,22 @@ private Map getTransforms(Map config, String typ } /** - * Creates a Zookeeper client. + * Creates a Zookeeper client if it doesn't exist and a url for zk is provided. * @param zookeeperUrl The Zookeeper URL. 
*/ - private Optional createClient(String zookeeperUrl) { - // can only create client, if have valid zookeeper URL - if (StringUtils.isNotBlank(zookeeperUrl)) { - CuratorFramework client = ConfigurationsUtils.getClient(zookeeperUrl); - client.start(); - return Optional.of(client); + private Optional setupClient(Optional zkClient, String zookeeperUrl) { + // can only create client if we have a valid zookeeper URL + if (!zkClient.isPresent()) { + if (StringUtils.isNotBlank(zookeeperUrl)) { + CuratorFramework client = ConfigurationsUtils.getClient(zookeeperUrl); + client.start(); + return Optional.of(client); + } else { + LOG.warn("Unable to setup zookeeper client - zk_quorum url not provided. **This will limit some Stellar functionality**"); + return Optional.empty(); + } } else { - LOG.warn("Unable to setup zookeeper client - zk_quorum url not provided. **This will limit some Stellar functionality**"); - return Optional.empty(); + return zkClient; } } @@ -118,13 +124,13 @@ private Map getGlobalConfig(Optional zkClient) LOG.warn("Exception thrown while attempting to get global config from Zookeeper.", e); } } - return new HashMap<>(); + return new LinkedHashMap<>(); } private Context createContext(Optional zkClient) { Context.Builder builder = new Context.Builder(); if (zkClient.isPresent()) { - builder.with(Context.Capabilities.ZOOKEEPER_CLIENT, () -> zkClient.get()) + builder.with(Context.Capabilities.ZOOKEEPER_CLIENT, zkClient::get) .with(Context.Capabilities.GLOBAL_CONFIG, () -> globalConfig); } return builder.build(); @@ -148,7 +154,7 @@ public Iterable extract(String line) throws IOException { */ private boolean updateLookupKV(LookupKV lkv) { Map ret = lkv.getValue().getMetadata(); - Map ind = new HashMap<>(); + Map ind = new LinkedHashMap<>(); String indicator = lkv.getKey().getIndicator(); // add indicator as a resolvable variable. 
Also enable using resolved/transformed variables and values from operating on the value metadata ind.put(INDICATOR, indicator); @@ -183,4 +189,8 @@ private Boolean filter(String filterPredicate, MapVariableResolver variableResol return filterProcessor.parse(filterPredicate, variableResolver, StellarFunctions.FUNCTION_RESOLVER(), stellarContext); } + protected void setZkClient(Optional zkClient) { + this.zkClient = zkClient; + } + } diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java new file mode 100644 index 0000000000..caa7e1d1d9 --- /dev/null +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java @@ -0,0 +1,42 @@ +package org.apache.metron.dataloads.extractor; + +import org.hamcrest.Matchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import static org.mockito.Matchers.isA; +import static org.mockito.Mockito.verify; + +public class ExtractorDecoratorTest { + + @Mock + Extractor extractor; + + @Before + public void before() { + MockitoAnnotations.initMocks(this); + } + + @Test + public void sets_member_variables() { + ExtractorDecorator decorator = new ExtractorDecorator(extractor); + Assert.assertThat(decorator.decoratedExtractor, Matchers.notNullValue()); + } + + @Test + public void calls_extractor_methods() throws IOException { + ExtractorDecorator decorator = new ExtractorDecorator(extractor); + decorator.initialize(new HashMap()); + decorator.extract("line"); + verify(extractor).initialize(isA(Map.class)); + verify(extractor).extract("line"); + } + +} diff --git 
a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java new file mode 100644 index 0000000000..e3c58d325b --- /dev/null +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java @@ -0,0 +1,92 @@ +package org.apache.metron.dataloads.extractor; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.adrianwalker.multilinestring.Multiline; +import org.apache.curator.framework.CuratorFramework; +import org.apache.metron.enrichment.converter.EnrichmentKey; +import org.apache.metron.enrichment.converter.EnrichmentValue; +import org.apache.metron.enrichment.lookup.LookupKV; +import org.apache.metron.enrichment.lookup.LookupKey; +import org.apache.metron.enrichment.lookup.LookupValue; +import org.hamcrest.CoreMatchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.MockitoAnnotations; + +import java.io.IOException; +import java.util.*; + +public class TransformFilterExtractorDecoratorTest { + + LinkedHashMap config1; + @Mock + CuratorFramework zkClient; + @Mock + Extractor extractor; + + @Before + public void setup() throws Exception { + MockitoAnnotations.initMocks(this); + config1 = new ObjectMapper().readValue(config1Contents, LinkedHashMap.class); +// HashMap config2 = new ObjectMapper().readValue(config1Contents, HashMap.class); + } + + /** + * { + * "zk_quorum" : "blah", + * "columns" : { + * "foo" : 0, + * "bar" : 1, + * "baz" : 2 + * }, + * "value_transform" : { + * "foo" : "TO_UPPER(foo)", + * "newvar" : "foo", + * "lowernewvar" : "TO_LOWER(newvar)" + * }, + * "value_filter" : "LENGTH(baz) > 0", + * "indicator_column" : "bar", + * "indicator_transform" : { + 
* "somevar" : "indicator", + * "indicator" : "TO_UPPER(somevar)" + * }, + * "indicator_filter" : "LENGTH(indicator) > 0", + * "type" : "testenrichment", + * "separator" : "," + * } + *} + */ + @Multiline + public static String config1Contents; + + @Test + public void simple_transform_value() throws IOException { + LookupKey lookupKey = new EnrichmentKey("testenrichment", "val2"); + LookupValue lookupValue = new EnrichmentValue(new HashMap() {{ + put("foo", "val1"); + put("bar", "val2"); + put("baz", "val3"); + }}); + LookupKV lkv = new LookupKV(lookupKey, lookupValue); + List lkvs = new ArrayList<>(); + lkvs.add(lkv); + Mockito.when(extractor.extract("val1,val2,val3")).thenReturn(lkvs); + TransformFilterExtractorDecorator decorator = new TransformFilterExtractorDecorator(extractor); + decorator.setZkClient(Optional.of(zkClient)); + decorator.initialize(config1); + Iterable extracted = decorator.extract("val1,val2,val3"); + Assert.assertThat(extracted, CoreMatchers.equalTo(lkvs)); + } + + // TODO + + // simple filter value + + // simple transform indicator + + // simple filter indicator + +} diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java index 748adc7b63..15e0384562 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java @@ -21,6 +21,7 @@ import org.adrianwalker.multilinestring.Multiline; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.PosixParser; +import org.apache.curator.test.TestingServer; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; @@ -60,6 +61,9 @@ public class SimpleEnrichmentFlatFileLoaderTest { private String enrichmentJson = "enrichment_config.json"; private String log4jProperty = "log4j"; + private TestingServer testZkServer; + private String zookeeperUrl; + Configuration config = null; /** { @@ -84,6 +88,8 @@ public void setup() throws Exception { config = kv.getValue(); testUtil = kv.getKey(); testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + testZkServer = new TestingServer(true); + zookeeperUrl = testZkServer.getConnectString(); } @After @@ -161,10 +167,11 @@ protected ExtractorState initialValue() { * "indicator" : "TO_UPPER(indicator)" * }, * "value_filter" : "LENGTH(domain) > 0", - * "indicator_filter" : "LENGTH(domain) > 0", + * "indicator_filter" : "LENGTH(indicator) > 0", * "indicator_column" : "domain", * "type" : "topdomain", - * "separator" : "," + * "separator" : ",", + * "zk_quorum" : "%ZK_QUORUM%" * }, * "extractor" : "CSV" *} @@ -175,13 +182,15 @@ protected ExtractorState initialValue() { @Test public void transforms_fields() throws Exception { Assert.assertNotNull(testTable); + // TODO +// ConfigurationsUtils.writeGlobalConfigToZookeeper(globalConfig, zookeeperUrl); String[] contents = new String[]{ "1,google.com", "2," }; EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfig); + ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfig.replaceAll("%ZK_QUORUM", zookeeperUrl)); Extractor e = handler.getExtractor(); SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); List extract = loader.extract(contents[0], e, cf, converter); From d9bb54ec27a0f3282d28ba40d043f0045c167a54 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Thu, 2 Feb 2017 14:47:08 -0700 Subject: [PATCH 23/32] Add negative test cases. 
Refactor options as enum in extractor decorator --- .../TransformFilterExtractorDecorator.java | 57 ++++-- ...TransformFilterExtractorDecoratorTest.java | 166 +++++++++++++----- 2 files changed, 164 insertions(+), 59 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index fc05ee126f..5073a425a8 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -18,14 +18,35 @@ import java.io.IOException; import java.util.*; +import static org.apache.metron.dataloads.extractor.TransformFilterExtractorDecorator.ExtractorOptions.*; + public class TransformFilterExtractorDecorator extends ExtractorDecorator { private static final Logger LOG = Logger.getLogger(TransformFilterExtractorDecorator.class); - private static final String VALUE_TRANSFORM = "value_transform"; - private static final String VALUE_FILTER = "value_filter"; - private static final String INDICATOR_TRANSFORM = "indicator_transform"; - private static final String INDICATOR_FILTER = "indicator_filter"; - private static final String ZK_QUORUM = "zk_quorum"; - private static final String INDICATOR = "indicator"; + + protected enum ExtractorOptions { + VALUE_TRANSFORM("value_transform"), + VALUE_FILTER("value_filter"), + INDICATOR_TRANSFORM("indicator_transform"), + INDICATOR_FILTER("indicator_filter"), + ZK_QUORUM("zk_quorum"), + INDICATOR("indicator"); + + private String key; + + ExtractorOptions(String key) { + this.key = key; + } + + @Override + public String toString() { + return key; + } + + public boolean existsIn(Map config) { + return 
config.containsKey(key); + } + } + private Optional zkClient; private Map valueTransforms; private Map indicatorTransforms; @@ -48,21 +69,21 @@ public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { @Override public void initialize(Map config) { super.initialize(config); - if (config.containsKey(VALUE_TRANSFORM)) { - this.valueTransforms = getTransforms(config, VALUE_TRANSFORM); + if (VALUE_TRANSFORM.existsIn(config)) { + this.valueTransforms = getTransforms(config, VALUE_TRANSFORM.toString()); } - if (config.containsKey(INDICATOR_TRANSFORM)) { - this.indicatorTransforms = getTransforms(config, INDICATOR_TRANSFORM); + if (INDICATOR_TRANSFORM.existsIn(config)) { + this.indicatorTransforms = getTransforms(config, INDICATOR_TRANSFORM.toString()); } - if (config.containsKey(VALUE_FILTER)) { - this.valueFilter = getFilter(config, VALUE_FILTER); + if (VALUE_FILTER.existsIn(config)) { + this.valueFilter = getFilter(config, VALUE_FILTER.toString()); } - if (config.containsKey(INDICATOR_FILTER)) { - this.indicatorFilter = getFilter(config, INDICATOR_FILTER); + if (INDICATOR_FILTER.existsIn(config)) { + this.indicatorFilter = getFilter(config, INDICATOR_FILTER.toString()); } String zkClientUrl = ""; - if (config.containsKey(ZK_QUORUM)) { - zkClientUrl = ConversionUtils.convert(config.get(ZK_QUORUM), String.class); + if (ZK_QUORUM.existsIn(config)) { + zkClientUrl = ConversionUtils.convert(config.get(ZK_QUORUM.toString()), String.class); } zkClient = setupClient(zkClient, zkClientUrl); this.globalConfig = getGlobalConfig(zkClient); @@ -157,12 +178,12 @@ private boolean updateLookupKV(LookupKV lkv) { Map ind = new LinkedHashMap<>(); String indicator = lkv.getKey().getIndicator(); // add indicator as a resolvable variable. 
Also enable using resolved/transformed variables and values from operating on the value metadata - ind.put(INDICATOR, indicator); + ind.put(INDICATOR.toString(), indicator); MapVariableResolver resolver = new MapVariableResolver(ret, ind, globalConfig); transform(valueTransforms, ret, resolver); transform(indicatorTransforms, ind, resolver); // update indicator - Object updatedIndicator = ind.get(INDICATOR); + Object updatedIndicator = ind.get(INDICATOR.toString()); if (updatedIndicator != null) { if (!(updatedIndicator instanceof String)) { throw new UnsupportedOperationException("Indicator transform must return String type"); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java index e3c58d325b..effb1b293a 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java @@ -6,12 +6,12 @@ import org.apache.metron.enrichment.converter.EnrichmentKey; import org.apache.metron.enrichment.converter.EnrichmentValue; import org.apache.metron.enrichment.lookup.LookupKV; -import org.apache.metron.enrichment.lookup.LookupKey; -import org.apache.metron.enrichment.lookup.LookupValue; import org.hamcrest.CoreMatchers; import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.MockitoAnnotations; @@ -21,72 +21,156 @@ public class TransformFilterExtractorDecoratorTest { - LinkedHashMap config1; @Mock CuratorFramework zkClient; @Mock Extractor extractor; + LinkedHashMap config1; + 
TransformFilterExtractorDecorator decorator; @Before public void setup() throws Exception { MockitoAnnotations.initMocks(this); config1 = new ObjectMapper().readValue(config1Contents, LinkedHashMap.class); -// HashMap config2 = new ObjectMapper().readValue(config1Contents, HashMap.class); + decorator = new TransformFilterExtractorDecorator(extractor); + decorator.setZkClient(Optional.of(zkClient)); + decorator.initialize(config1); } /** - * { - * "zk_quorum" : "blah", - * "columns" : { - * "foo" : 0, - * "bar" : 1, - * "baz" : 2 - * }, - * "value_transform" : { - * "foo" : "TO_UPPER(foo)", - * "newvar" : "foo", - * "lowernewvar" : "TO_LOWER(newvar)" - * }, - * "value_filter" : "LENGTH(baz) > 0", - * "indicator_column" : "bar", - * "indicator_transform" : { - * "somevar" : "indicator", - * "indicator" : "TO_UPPER(somevar)" - * }, - * "indicator_filter" : "LENGTH(indicator) > 0", - * "type" : "testenrichment", - * "separator" : "," - * } + *{ + * "zk_quorum" : "blah", + * "columns" : { + * "foo" : 0, + * "bar" : 1, + * "baz" : 2 + * }, + * "value_transform" : { + * "foo" : "TO_UPPER(foo)", + * "newvar" : "foo", + * "lowernewvar" : "TO_LOWER(newvar)" + * }, + * "value_filter" : "LENGTH(baz) > 0", + * "indicator_column" : "bar", + * "indicator_transform" : { + * "somevar" : "indicator", + * "indicator" : "TO_UPPER(somevar)" + * }, + * "indicator_filter" : "LENGTH(indicator) > 0", + * "type" : "testenrichment", + * "separator" : "," *} */ @Multiline public static String config1Contents; @Test - public void simple_transform_value() throws IOException { - LookupKey lookupKey = new EnrichmentKey("testenrichment", "val2"); - LookupValue lookupValue = new EnrichmentValue(new HashMap() {{ + public void transforms_values_and_indicators() throws IOException { + final String indicatorVal = "val2"; + EnrichmentKey lookupKey = new EnrichmentKey("testenrichment", indicatorVal); + EnrichmentValue lookupValue = new EnrichmentValue(new HashMap() {{ put("foo", "val1"); + put("bar", 
indicatorVal); + put("baz", "val3"); + }}); + LookupKV lkv = new LookupKV<>(lookupKey, lookupValue); + List extractedLkvs = new ArrayList<>(); + extractedLkvs.add(lkv); + Mockito.when(extractor.extract("val1,val2,val3")).thenReturn(extractedLkvs); + Iterable extracted = decorator.extract("val1,val2,val3"); + + EnrichmentKey expectedLookupKey = new EnrichmentKey("testenrichment", "VAL2"); + EnrichmentValue expectedLookupValue = new EnrichmentValue(new HashMap() {{ + put("foo", "VAL1"); put("bar", "val2"); put("baz", "val3"); + put("newvar", "VAL1"); + put("lowernewvar", "val1"); + }}); + LookupKV expectedLkv = new LookupKV<>(expectedLookupKey, expectedLookupValue); + List expectedLkvs = new ArrayList<>(); + expectedLkvs.add(expectedLkv); + Assert.assertThat(extracted, CoreMatchers.equalTo(expectedLkvs)); + } + + @Test + public void filters_values() throws Exception { + final String indicatorVal = "val2"; + EnrichmentKey lookupKey = new EnrichmentKey("testenrichment", indicatorVal); + EnrichmentValue lookupValue = new EnrichmentValue(new HashMap() {{ + put("foo", "val1"); + put("bar", indicatorVal); + put("baz", ""); + }}); + LookupKV lkv = new LookupKV<>(lookupKey, lookupValue); + List extractedLkvs = new ArrayList<>(); + extractedLkvs.add(lkv); + Mockito.when(extractor.extract("val1,val2,")).thenReturn(extractedLkvs); + Iterable extracted = decorator.extract("val1,val2,"); + Assert.assertThat(extracted, CoreMatchers.equalTo(new ArrayList<>())); + } + + @Test + public void filters_indicators() throws Exception { + EnrichmentKey lookupKey = new EnrichmentKey("testenrichment", ""); + EnrichmentValue lookupValue = new EnrichmentValue(new HashMap() {{ + put("foo", "val1"); + put("bar", ""); + put("baz", "val3"); }}); - LookupKV lkv = new LookupKV(lookupKey, lookupValue); - List lkvs = new ArrayList<>(); - lkvs.add(lkv); - Mockito.when(extractor.extract("val1,val2,val3")).thenReturn(lkvs); - TransformFilterExtractorDecorator decorator = new 
TransformFilterExtractorDecorator(extractor); + LookupKV lkv = new LookupKV<>(lookupKey, lookupValue); + List extractedLkvs = new ArrayList<>(); + extractedLkvs.add(lkv); + Mockito.when(extractor.extract("val1,,val3")).thenReturn(extractedLkvs); + Iterable extracted = decorator.extract("val1,,val3"); + Assert.assertThat(extracted, CoreMatchers.equalTo(new ArrayList<>())); + } + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void bad_value_transform_causes_exception() throws Exception { + final int badValue = 5; + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Object is not of type java.util.Map"); + config1.put(TransformFilterExtractorDecorator.ExtractorOptions.VALUE_TRANSFORM.toString(), badValue); + decorator = new TransformFilterExtractorDecorator(extractor); decorator.setZkClient(Optional.of(zkClient)); decorator.initialize(config1); - Iterable extracted = decorator.extract("val1,val2,val3"); - Assert.assertThat(extracted, CoreMatchers.equalTo(lkvs)); } - // TODO - - // simple filter value + @Test + public void bad_value_filter_causes_exception() throws Exception { + final int badValue = 5; + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Object is not of type java.lang.String"); + config1.put(TransformFilterExtractorDecorator.ExtractorOptions.VALUE_FILTER.toString(), badValue); + decorator = new TransformFilterExtractorDecorator(extractor); + decorator.setZkClient(Optional.of(zkClient)); + decorator.initialize(config1); + } - // simple transform indicator + @Test + public void bad_indicator_transform_causes_exception() throws Exception { + final int badValue = 5; + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Object is not of type java.util.Map"); + config1.put(TransformFilterExtractorDecorator.ExtractorOptions.INDICATOR_TRANSFORM.toString(), badValue); + decorator = new TransformFilterExtractorDecorator(extractor); + 
decorator.setZkClient(Optional.of(zkClient)); + decorator.initialize(config1); + } - // simple filter indicator + @Test + public void bad_indicator_filter_causes_exception() throws Exception { + final int badValue = 5; + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Object is not of type java.lang.String"); + config1.put(TransformFilterExtractorDecorator.ExtractorOptions.INDICATOR_FILTER.toString(), badValue); + decorator = new TransformFilterExtractorDecorator(extractor); + decorator.setZkClient(Optional.of(zkClient)); + decorator.initialize(config1); + } } From eafc786250d9b8e6283bd71c91bbd270ba4d1311 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Fri, 3 Feb 2017 11:52:03 -0700 Subject: [PATCH 24/32] Get integration tests for flat file loader working with my branch. Fix trampled commit for ExtractorHandler --- .../dataloads/extractor/ExtractorHandler.java | 10 +- ...richmentFlatFileLoaderIntegrationTest.java | 25 ++- .../SimpleEnrichmentFlatFileLoaderTest.java | 211 ------------------ 3 files changed, 20 insertions(+), 226 deletions(-) delete mode 100644 metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java index 0bdd44a177..a9df2fdd4f 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorHandler.java @@ -32,7 +32,7 @@ public class ExtractorHandler { final static ObjectMapper _mapper = new ObjectMapper(); private Map config; private Extractor extractor; - private InputFormatHandler inputFormatHandler = Formats.BY_LINE; + private 
InputFormatHandler inputFormat = Formats.BY_LINE; public Map getConfig() { return config; @@ -45,16 +45,16 @@ public void setConfig(Map config) { this.config = config; } - public InputFormatHandler getInputFormatHandler() { - return inputFormatHandler; + public InputFormatHandler getInputFormat() { + return inputFormat; } /** * Set by jackson */ - public void setInputFormatHandler(String handler) { + public void setInputFormat(String handler) { try { - this.inputFormatHandler = Formats.create(handler); + this.inputFormat = Formats.create(handler); } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) { throw new IllegalStateException("Unable to create an inputformathandler", e); } diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index b2777bb4f1..acfa9103e5 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -55,6 +55,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.startsWith; + public class SimpleEnrichmentFlatFileLoaderIntegrationTest { private static HBaseTestingUtility testUtil; @@ -120,7 +123,7 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { * "config" : { * "columns" : { * "host" : 0, - * "meta" : 1 + * "meta" : 2 * }, * "value_transform" : { * "host" : "TO_UPPER(host)" @@ -133,7 +136,8 @@ public class 
SimpleEnrichmentFlatFileLoaderIntegrationTest { * "indicator_filter" : "LENGTH(indicator) > 0", * "type" : "enrichment", * "separator" : "," - * } + * }, + * "extractor" : "CSV" *} */ @Multiline @@ -169,9 +173,9 @@ public static void setup() throws Exception { if(stellarExtractorConfigFile.exists()) { stellarExtractorConfigFile.delete(); } - Files.write(stellarExtractorConfigFile.toPath() - , stellarExtractorConfig.getBytes() - , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + Files.write( stellarExtractorConfigFile.toPath() + , stellarExtractorConfig.getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING ); if(file1.exists()) { file1.delete(); @@ -231,6 +235,7 @@ public static void teardown() throws Exception { multilineZipFile.delete(); lineByLineExtractorConfigFile.delete(); wholeFileExtractorConfigFile.delete(); + stellarExtractorConfigFile.delete(); } @@ -392,11 +397,11 @@ public void transforms_fields() throws Exception { testTable.delete(new Delete(r.getRow())); } Assert.assertEquals(NUM_LINES, results.size()); - Assert.assertTrue(results.get(0).getKey().indicator.startsWith("google")); - Assert.assertEquals(results.get(0).getKey().type, "enrichment"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertTrue(results.get(0).getValue().getMetadata().get("meta").toString().startsWith("foo")); - Assert.assertTrue(results.get(0).getValue().getMetadata().get("host").toString().startsWith("GOOGLE")); + Assert.assertThat(results.get(0).getKey().getIndicator(), startsWith("GOOGLE")); + Assert.assertThat(results.get(0).getKey().type, equalTo("enrichment")); + Assert.assertThat(results.get(0).getValue().getMetadata().size(), equalTo(2)); + Assert.assertThat(results.get(0).getValue().getMetadata().get("meta").toString(), startsWith("foo")); + Assert.assertThat(results.get(0).getValue().getMetadata().get("host").toString(), startsWith("GOOGLE")); } } diff --git 
a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java deleted file mode 100644 index 15e0384562..0000000000 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderTest.java +++ /dev/null @@ -1,211 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.metron.dataloads.nonbulk.flatfile; - -import com.google.common.collect.ImmutableList; -import org.adrianwalker.multilinestring.Multiline; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.PosixParser; -import org.apache.curator.test.TestingServer; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.ResultScanner; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.metron.dataloads.extractor.Extractor; -import org.apache.metron.dataloads.extractor.ExtractorHandler; -import org.apache.metron.dataloads.hbase.mr.HBaseUtil; -import org.apache.metron.enrichment.converter.EnrichmentConverter; -import org.apache.metron.enrichment.converter.EnrichmentKey; -import org.apache.metron.enrichment.converter.EnrichmentValue; -import org.apache.metron.enrichment.lookup.LookupKV; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.stream.Stream; - -public class SimpleEnrichmentFlatFileLoaderTest { - - private HBaseTestingUtility testUtil; - - /** The test table. 
*/ - private HTable testTable; - private String tableName = "enrichment"; - private String cf = "cf"; - private String csvFile="input.csv"; - private String extractorJson = "extractor.json"; - private String enrichmentJson = "enrichment_config.json"; - private String log4jProperty = "log4j"; - - private TestingServer testZkServer; - private String zookeeperUrl; - - Configuration config = null; - /** - { - "config" : { - "columns" : { - "host" : 0, - "meta" : 2 - }, - "indicator_column" : "host", - "separator" : ",", - "type" : "enrichment" - }, - "extractor" : "CSV" - } - */ - @Multiline - private static String extractorConfig; - - @Before - public void setup() throws Exception { - Map.Entry kv = HBaseUtil.INSTANCE.create(true); - config = kv.getValue(); - testUtil = kv.getKey(); - testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); - testZkServer = new TestingServer(true); - zookeeperUrl = testZkServer.getConnectString(); - } - - @After - public void teardown() throws Exception { - HBaseUtil.INSTANCE.teardown(testUtil); - } - - @Test - public void testCommandLine() throws Exception { - Configuration conf = HBaseConfiguration.create(); - - String[] argv = { "-c cf", "-t enrichment" - , "-e extractor.json", "-n enrichment_config.json" - , "-l log4j", "-i input.csv" - , "-p 2", "-b 128" - }; - String[] otherArgs = new GenericOptionsParser(conf, argv).getRemainingArgs(); - - CommandLine cli = SimpleEnrichmentFlatFileLoader.LoadOptions.parse(new PosixParser(), otherArgs); - Assert.assertEquals(extractorJson,SimpleEnrichmentFlatFileLoader.LoadOptions.EXTRACTOR_CONFIG.get(cli).trim()); - Assert.assertEquals(cf, SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_CF.get(cli).trim()); - Assert.assertEquals(tableName,SimpleEnrichmentFlatFileLoader.LoadOptions.HBASE_TABLE.get(cli).trim()); - Assert.assertEquals(enrichmentJson,SimpleEnrichmentFlatFileLoader.LoadOptions.ENRICHMENT_CONFIG.get(cli).trim()); - 
Assert.assertEquals(csvFile,SimpleEnrichmentFlatFileLoader.LoadOptions.INPUT.get(cli).trim()); - Assert.assertEquals(log4jProperty, SimpleEnrichmentFlatFileLoader.LoadOptions.LOG4J_PROPERTIES.get(cli).trim()); - Assert.assertEquals("2", SimpleEnrichmentFlatFileLoader.LoadOptions.NUM_THREADS.get(cli).trim()); - Assert.assertEquals("128", SimpleEnrichmentFlatFileLoader.LoadOptions.BATCH_SIZE.get(cli).trim()); - } - - @Test - public void test() throws Exception { - Assert.assertNotNull(testTable); - String contents = "google.com,1,foo"; - - EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(extractorConfig); - Extractor e = handler.getExtractor(); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - Stream contentStreams = ImmutableList.of(contents).stream(); - ThreadLocal state = new ThreadLocal() { - @Override - protected ExtractorState initialValue() { - return new ExtractorState(testTable, e, converter); - } - }; - loader.load(ImmutableList.of(contentStreams) - , state - , cf - , 2 - ); - - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for(Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(1, results.size()); - Assert.assertEquals(results.get(0).getKey().indicator, "google.com"); - Assert.assertEquals(results.get(0).getKey().type, "enrichment"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 2); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("meta"), "foo"); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("host"), "google.com"); - } - - /** - *{ - * "config" : { - * "columns" : { - * "domain" : 1 - * }, - * "value_transform" : { - * "domain" : "TO_UPPER(domain)" - * }, - * "indicator_transform" : { - * "indicator" : "TO_UPPER(indicator)" - * }, - * "value_filter" : "LENGTH(domain) > 0", - * "indicator_filter" : 
"LENGTH(indicator) > 0", - * "indicator_column" : "domain", - * "type" : "topdomain", - * "separator" : ",", - * "zk_quorum" : "%ZK_QUORUM%" - * }, - * "extractor" : "CSV" - *} - */ - @Multiline - private static String stellarExtractorConfig; - - @Test - public void transforms_fields() throws Exception { - Assert.assertNotNull(testTable); - // TODO -// ConfigurationsUtils.writeGlobalConfigToZookeeper(globalConfig, zookeeperUrl); - String[] contents = new String[]{ - "1,google.com", - "2," - }; - - EnrichmentConverter converter = new EnrichmentConverter(); - ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfig.replaceAll("%ZK_QUORUM", zookeeperUrl)); - Extractor e = handler.getExtractor(); - SimpleEnrichmentFlatFileLoader loader = new SimpleEnrichmentFlatFileLoader(); - List extract = loader.extract(contents[0], e, cf, converter); - testTable.put(extract); - extract = loader.extract(contents[1], e, cf, converter); - testTable.put(extract); - - ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); - List> results = new ArrayList<>(); - for (Result r : scanner) { - results.add(converter.fromResult(r, cf)); - } - Assert.assertEquals(results.get(0).getKey().type, "topdomain"); - Assert.assertEquals(results.get(0).getKey().getIndicator(), "GOOGLE.COM"); - Assert.assertEquals(results.get(0).getValue().getMetadata().size(), 1); - Assert.assertEquals(results.get(0).getValue().getMetadata().get("domain"), "GOOGLE.COM"); - } -} From ad1aef760948109565b7144479151312ebccc24d Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Fri, 3 Feb 2017 12:46:05 -0700 Subject: [PATCH 25/32] Get integration tests working for Stellar transformations in the file loader --- ...richmentFlatFileLoaderIntegrationTest.java | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java 
b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index acfa9103e5..4475037b39 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -20,6 +20,7 @@ import org.adrianwalker.multilinestring.Multiline; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.PosixParser; +import org.apache.curator.framework.CuratorFramework; import org.apache.curator.test.TestingServer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -31,6 +32,7 @@ import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.metron.common.configuration.ConfigurationsUtils; import org.apache.metron.dataloads.hbase.mr.HBaseUtil; import org.apache.metron.enrichment.converter.EnrichmentConverter; import org.apache.metron.enrichment.converter.EnrichmentKey; @@ -65,8 +67,9 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { /** The test table. 
*/ private static HTable testTable; private static Configuration config = null; - private TestingServer testZkServer; - private String zookeeperUrl; + private static TestingServer testZkServer; + private static String zookeeperUrl; + private static CuratorFramework client; private static final String tableName = "enrichment"; private static final String cf = "cf"; private static final String csvFile="input.csv"; @@ -83,6 +86,14 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { private static final File stellarExtractorConfigFile = new File("target/sefflt_extractorConfig_stellar.json"); private static final int NUM_LINES = 1000; + /** + * { + * "enrichment_property" : "valfromglobalconfig" + * } + */ + @Multiline + public static String globalConfig; + /** { "config" : { @@ -121,12 +132,15 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { /** *{ * "config" : { + * "zk_quorum" : "%ZK_QUORUM%", * "columns" : { * "host" : 0, + * "empty" : 1, * "meta" : 2 * }, * "value_transform" : { - * "host" : "TO_UPPER(host)" + * "host" : "TO_UPPER(host)", + * "empty" : "enrichment_property" * }, * "value_filter" : "LENGTH(host) > 0", * "indicator_column" : "host", @@ -150,6 +164,8 @@ public static void setup() throws Exception { config = kv.getValue(); testUtil = kv.getKey(); testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); + zookeeperUrl = getZookeeperUrl(config.get("hbase.zookeeper.quorum"), testUtil.getZkCluster().getClientPort()); + setupGlobalConfig(zookeeperUrl); for(Result r : testTable.getScanner(Bytes.toBytes(cf))) { Delete d = new Delete(r.getRow()); @@ -174,7 +190,7 @@ public static void setup() throws Exception { stellarExtractorConfigFile.delete(); } Files.write( stellarExtractorConfigFile.toPath() - , stellarExtractorConfig.getBytes() + , stellarExtractorConfig.replace("%ZK_QUORUM%", zookeeperUrl).getBytes() , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING ); if(file1.exists()) { @@ -225,6 
+241,16 @@ public static void setup() throws Exception { } + private static String getZookeeperUrl(String host, int port) { + return host + ":" + port; + } + + private static void setupGlobalConfig(String zookeeperUrl) throws Exception { + client = ConfigurationsUtils.getClient(zookeeperUrl); + client.start(); + ConfigurationsUtils.writeGlobalConfigToZookeeper(globalConfig.getBytes(), zookeeperUrl); + } + @AfterClass public static void teardown() throws Exception { HBaseUtil.INSTANCE.teardown(testUtil); @@ -382,7 +408,7 @@ public void testMRLineByLine() throws Exception { } @Test - public void transforms_fields() throws Exception { + public void stellar_transforms_and_filters_indicators_and_value_metadata() throws Exception { String[] argv = {"-c cf", "-t enrichment" , "-e " + stellarExtractorConfigFile.getPath() , "-i " + multilineFile.getPath() @@ -399,8 +425,9 @@ public void transforms_fields() throws Exception { Assert.assertEquals(NUM_LINES, results.size()); Assert.assertThat(results.get(0).getKey().getIndicator(), startsWith("GOOGLE")); Assert.assertThat(results.get(0).getKey().type, equalTo("enrichment")); - Assert.assertThat(results.get(0).getValue().getMetadata().size(), equalTo(2)); + Assert.assertThat(results.get(0).getValue().getMetadata().size(), equalTo(3)); Assert.assertThat(results.get(0).getValue().getMetadata().get("meta").toString(), startsWith("foo")); + Assert.assertThat(results.get(0).getValue().getMetadata().get("empty").toString(), startsWith("valfromglobalconfig")); Assert.assertThat(results.get(0).getValue().getMetadata().get("host").toString(), startsWith("GOOGLE")); } From 799811ccd587d9fcd2135ea3970d5bbf1121a29b Mon Sep 17 00:00:00 2001 From: cstella Date: Fri, 3 Feb 2017 17:21:34 -0500 Subject: [PATCH 26/32] Reacted to @mmiklavcic --- .../common/utils/cli}/OptionHandler.java | 6 ++--- .../nonbulk/flatfile/LoadOptions.java | 27 ++++++++++--------- .../SimpleEnrichmentFlatFileLoader.java | 1 + .../flatfile/importer/MapReduceImporter.java 
| 5 +++- 4 files changed, 22 insertions(+), 17 deletions(-) rename metron-platform/{metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile => metron-common/src/main/java/org/apache/metron/common/utils/cli}/OptionHandler.java (82%) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/cli/OptionHandler.java similarity index 82% rename from metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java rename to metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/cli/OptionHandler.java index 30620b10d9..85e752018c 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/OptionHandler.java +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/cli/OptionHandler.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.metron.dataloads.nonbulk.flatfile; +package org.apache.metron.common.utils.cli; import com.google.common.base.Function; import org.apache.commons.cli.CommandLine; @@ -23,9 +23,9 @@ import java.util.Optional; -public abstract class OptionHandler implements Function +public abstract class OptionHandler> implements Function { - public Optional getValue(LoadOptions option, CommandLine cli) { + public Optional getValue(OPT_T option, CommandLine cli) { return Optional.empty(); } } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java index 76a9b2f2af..ddaf6a6528 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/LoadOptions.java @@ -22,6 +22,7 @@ import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; import org.apache.metron.common.utils.ConversionUtils; +import org.apache.metron.common.utils.cli.OptionHandler; import org.apache.metron.dataloads.nonbulk.flatfile.importer.ImportStrategy; import javax.annotation.Nullable; @@ -33,7 +34,7 @@ import java.util.Optional; public enum LoadOptions { - HELP("h", new OptionHandler() { + HELP("h", new OptionHandler() { @Nullable @Override @@ -41,7 +42,7 @@ public Option apply(@Nullable String s) { return new Option(s, "help", false, "Generate Help screen"); } }) - ,QUIET("q", new OptionHandler() { + ,QUIET("q", new OptionHandler() { @Nullable @Override @@ -54,7 +55,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.of(option.has(cli)); } }) - , IMPORT_MODE("m", new OptionHandler() { + , IMPORT_MODE("m", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -73,7 
+74,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.of(ImportStrategy.getStrategy(mode).orElse(ImportStrategy.LOCAL)); } }) - ,HBASE_TABLE("t", new OptionHandler() { + ,HBASE_TABLE("t", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -88,7 +89,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.ofNullable(option.get(cli).trim()); } }) - ,HBASE_CF("c", new OptionHandler() { + ,HBASE_CF("c", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -103,7 +104,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.ofNullable(option.get(cli).trim()); } }) - ,EXTRACTOR_CONFIG("e", new OptionHandler() { + ,EXTRACTOR_CONFIG("e", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -122,7 +123,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { } } }) - ,ENRICHMENT_CONFIG("n", new OptionHandler() { + ,ENRICHMENT_CONFIG("n", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -135,7 +136,7 @@ public Option apply(@Nullable String s) { return o; } }) - ,LOG4J_PROPERTIES("l", new OptionHandler() { + ,LOG4J_PROPERTIES("l", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -145,7 +146,7 @@ public Option apply(@Nullable String s) { return o; } }) - ,NUM_THREADS("p", new OptionHandler() { + ,NUM_THREADS("p", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -164,7 +165,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.of(numThreads); } }) - ,BATCH_SIZE("b", new OptionHandler() { + ,BATCH_SIZE("b", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -183,7 +184,7 @@ public Optional getValue(LoadOptions option, CommandLine cli) { return Optional.of(batchSize); } }) - ,INPUT("i", new 
OptionHandler() { + ,INPUT("i", new OptionHandler() { @Nullable @Override public Option apply(@Nullable String s) { @@ -205,8 +206,8 @@ public Optional getValue(LoadOptions option, CommandLine cli) { ; Option option; String shortCode; - OptionHandler handler; - LoadOptions(String shortCode, OptionHandler optionHandler) { + OptionHandler handler; + LoadOptions(String shortCode, OptionHandler optionHandler) { this.shortCode = shortCode; this.handler = optionHandler; this.option = optionHandler.apply(shortCode); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java index 926292a4f4..8ee11aaa18 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoader.java @@ -39,6 +39,7 @@ public static void main(String... 
argv) throws Exception { String[] otherArgs = new GenericOptionsParser(hadoopConfig, argv).getRemainingArgs(); main(hadoopConfig, otherArgs); } + public static void main(Configuration hadoopConfig, String[] argv) throws Exception { CommandLine cli = LoadOptions.parse(new PosixParser(), argv); diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java index 6b96079b2e..e83bdd68c8 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/nonbulk/flatfile/importer/MapReduceImporter.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.mapreduce.Job; +import org.apache.log4j.Logger; import org.apache.metron.dataloads.extractor.ExtractorHandler; import org.apache.metron.dataloads.hbase.mr.BulkLoadMapper; import org.apache.metron.dataloads.nonbulk.flatfile.LoadOptions; @@ -39,6 +40,8 @@ public enum MapReduceImporter implements Importer{ INSTANCE ; + private static final Logger LOG = Logger.getLogger(MapReduceImporter.class); + @Override public void importData(EnumMap> config , ExtractorHandler handler @@ -50,7 +53,7 @@ public void importData(EnumMap> config Job job = Job.getInstance(hadoopConfig); List inputs = (List) config.get(LoadOptions.INPUT).get(); job.setJobName("MapReduceImporter: " + inputs.stream().collect(Collectors.joining(",")) + " => " + table + ":" + cf); - System.out.println("Configuring " + job.getJobName()); + LOG.info("Configuring " + job.getJobName()); job.setJarByClass(MapReduceImporter.class); 
job.setMapperClass(org.apache.metron.dataloads.hbase.mr.BulkLoadMapper.class); job.setOutputFormatClass(TableOutputFormat.class); From d25dbc5b71a6b0412a86da89418aa33b2d5f2a98 Mon Sep 17 00:00:00 2001 From: cstella Date: Mon, 6 Feb 2017 09:55:16 -0500 Subject: [PATCH 27/32] Shaving off seconds for the integration tests. --- ...eastRecentlyUsedPrunerIntegrationTest.java | 35 +++++++++---------- .../nonbulk/taxii/TaxiiIntegrationTest.java | 13 +++---- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/LeastRecentlyUsedPrunerIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/LeastRecentlyUsedPrunerIntegrationTest.java index 14a5143b8b..d82be9d8f4 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/LeastRecentlyUsedPrunerIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/hbase/mr/LeastRecentlyUsedPrunerIntegrationTest.java @@ -36,10 +36,7 @@ import org.apache.metron.enrichment.lookup.accesstracker.BloomAccessTracker; import org.apache.metron.enrichment.lookup.accesstracker.PersistentAccessTracker; import org.apache.metron.test.utils.UnitTestHelper; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; +import org.junit.*; import java.util.ArrayList; import java.util.HashMap; @@ -49,21 +46,21 @@ public class LeastRecentlyUsedPrunerIntegrationTest { /** The test util. */ - private HBaseTestingUtility testUtil; + private static HBaseTestingUtility testUtil; /** The test table. 
*/ - private HTable testTable; - private HTable atTable; - private String tableName = "malicious_domains"; - private String cf = "cf"; - private String atTableName = "access_trackers"; - private String atCF= "cf"; - private String beginTime = "04/14/2016 12:00:00"; - private String timeFormat = "georgia"; - private Configuration config = null; + private static HTable testTable; + private static HTable atTable; + private static final String tableName = "malicious_domains"; + private static final String cf = "cf"; + private static final String atTableName = "access_trackers"; + private static final String atCF= "cf"; + private static final String beginTime = "04/14/2016 12:00:00"; + private static final String timeFormat = "georgia"; + private static Configuration config = null; - @Before - public void setup() throws Exception { + @BeforeClass + public static void setup() throws Exception { UnitTestHelper.setJavaLoggingLevel(Level.SEVERE); Map.Entry kv = HBaseUtil.INSTANCE.create(true); config = kv.getValue(); @@ -71,10 +68,12 @@ public void setup() throws Exception { testTable = testUtil.createTable(Bytes.toBytes(tableName), Bytes.toBytes(cf)); atTable = testUtil.createTable(Bytes.toBytes(atTableName), Bytes.toBytes(atCF)); } - @After - public void teardown() throws Exception { + + @AfterClass + public static void teardown() throws Exception { HBaseUtil.INSTANCE.teardown(testUtil); } + public List getKeys(int start, int end) { List keys = new ArrayList<>(); for(int i = start;i < end;++i) { diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/taxii/TaxiiIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/taxii/TaxiiIntegrationTest.java index 1cb58d864c..0223514bfa 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/taxii/TaxiiIntegrationTest.java +++ 
b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/taxii/TaxiiIntegrationTest.java @@ -33,10 +33,7 @@ import org.apache.metron.enrichment.converter.EnrichmentValue; import org.apache.metron.test.mock.MockHTable; import org.apache.metron.enrichment.lookup.LookupKV; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; +import org.junit.*; import java.io.IOException; import java.util.HashSet; @@ -44,13 +41,13 @@ public class TaxiiIntegrationTest { - @Before - public void setup() throws IOException { + @BeforeClass + public static void setup() throws IOException { MockTaxiiService.start(8282); } - @After - public void teardown() { + @AfterClass + public static void teardown() { MockTaxiiService.shutdown(); MockHTable.Provider.clear(); } From c0b275bda2df4d007623b0555637867542e440ba Mon Sep 17 00:00:00 2001 From: cstella Date: Mon, 6 Feb 2017 09:58:04 -0500 Subject: [PATCH 28/32] whoops, missed one. --- .../metron/indexing/integration/IndexingIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java b/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java index a93c442ab0..ae04e43c59 100644 --- a/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java +++ b/metron-platform/metron-indexing/src/test/java/org/apache/metron/indexing/integration/IndexingIntegrationTest.java @@ -175,8 +175,8 @@ public void test() throws Exception { .withComponent("config", configUploadComponent) .withComponent("storm", fluxComponent) .withComponent("search", getSearchComponent(topologyProperties)) - .withMillisecondsBetweenAttempts(15000) - .withNumRetries(10) + .withMillisecondsBetweenAttempts(1500) + .withNumRetries(100) .withMaxTimeMS(150000) 
.withCustomShutdownOrder(new String[] {"search","storm","config","kafka","zk"}) .build(); From c27b7830e93e14ada4891889291d2ea4caef0ee1 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Mon, 6 Feb 2017 09:46:15 -0700 Subject: [PATCH 29/32] Add license headers to new files --- .../dataloads/extractor/ExtractorDecorator.java | 17 +++++++++++++++++ .../TransformFilterExtractorDecorator.java | 17 +++++++++++++++++ .../extractor/ExtractorDecoratorTest.java | 17 +++++++++++++++++ .../TransformFilterExtractorDecoratorTest.java | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java index 0ac5527d89..bf42760aa3 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/ExtractorDecorator.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.extractor; import org.apache.metron.enrichment.lookup.LookupKV; diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index 5073a425a8..19de83e380 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.extractor; import com.fasterxml.jackson.core.type.TypeReference; diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java index caa7e1d1d9..0526d012ae 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.metron.dataloads.extractor; import org.hamcrest.Matchers; diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java index effb1b293a..80738b250e 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.metron.dataloads.extractor; import com.fasterxml.jackson.databind.ObjectMapper; From b73339f02c0a33fa84093f0f71431a975f636716 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Tue, 7 Feb 2017 16:51:03 -0700 Subject: [PATCH 30/32] Add README info for loader Stellar transformations. 
Add integration test for custom extractor definition --- .../metron-data-management/README.md | 81 ++++++++++++++++++- .../dataloads/extractor/Extractors.java | 3 +- ...richmentFlatFileLoaderIntegrationTest.java | 62 +++++++++++++- 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/metron-platform/metron-data-management/README.md b/metron-platform/metron-data-management/README.md index eaafda4e74..dd793873aa 100644 --- a/metron-platform/metron-data-management/README.md +++ b/metron-platform/metron-data-management/README.md @@ -89,7 +89,7 @@ for the value will be 'meta'. For instance, given an input string of `123.45.12 would be extracted: * Indicator : `123.45.123.12` * Type : `malicious_ip` -* Value : `{ "source" : "the grapevine" }` +* Value : `{ "ip" : "123.45.123.12", "source" : "the grapevine" }` ### STIX Extractor @@ -127,6 +127,85 @@ addresses from the set of all possible addresses. Note that if no categories ar Also, only address and domain types allow filtering via `stix_address_categories` and `stix_domain_categories` config parameters. +### Common Extractor Properties + +Users also have the ability to transform and filter enrichment and threat intel data using Stellar as it is loaded into HBase. This feature is available to all extractor types. + +As an example, we will be providing a CSV list of top domains as an enrichment and filtering the value metadata, as well as the indicator column, with Stellar expressions. 
+ +```` +{ + "config" : { + "zk_quorum" : "node1:2181", + "columns" : { + "rank" : 0, + "domain" : 1 + }, + "value_transform" : { + "domain" : "DOMAIN_REMOVE_TLD(domain)" + }, + "value_filter" : "LENGTH(domain) > 0", + "indicator_column" : "domain", + "indicator_transform" : { + "indicator" : "DOMAIN_REMOVE_TLD(indicator)" + }, + "indicator_filter" : "LENGTH(indicator) > 0", + "type" : "top_domains", + "separator" : "," + }, + "extractor" : "CSV" +} +```` + +There are 2 property maps that work with full Stellar expressions, and 2 properties that will work with Stellar predicates. + +| Property | Description | +|---------------------|-------------| +| value_transform | Transform fields defined in the "columns" mapping with Stellar transformations. New keys introduced in the transform will be added to the key metadata. | +| value_filter | Allows additional filtering with Stellar predicates based on results from the value transformations. In this example, records whose domain property is empty after removing the TLD will be omitted. | +| indicator_transform | Transform the indicator column independent of the value transformations. You can refer to the original indicator value by using "indicator" as the variable name, as shown in the example above. In addition, if you prefer to piggyback your transformations, you can refer to the variable "domain", which will allow your indicator transforms to inherit transformations done to this value during the value transformations. | +| indicator_filter | Allows additional filtering with Stellar predicates based on results from the value transformations. In this example, records whose indicator value is empty after removing the TLD will be omitted. | + +top-list.csv +```` +1,google.com +2,youtube.com +... 
+```` + +Running a file import with the above data and extractor configuration would result in the following 2 extracted data records: + +| Indicator | Type | Value | +|-----------|------|-------| +| google | top_domains | { "rank" : "1", "domain" : "google" } | +| youtube | top_domains | { "rank" : "2", "domain" : "youtube" } | + +Similar to the parser framework, providing a Zookeeper quorum via the zk_quorum property will enable Stellar to access properties that reside in the global config. +Expanding on our example above, if the global config looks as follows: +```` +{ + "global_property" : "metron-ftw" +} +```` + +And we expand our value_transform: +```` +... + "value_transform" : { + "domain" : "DOMAIN_REMOVE_TLD(domain)", + "a-new-prop" : "global_property" + }, +... + +```` + +The resulting value data would look like the following: + +| Indicator | Type | Value | +|-----------|------|-------| +| google | top_domains | { "rank" : "1", "domain" : "google", "a-new-prop" : "metron-ftw" } | +| youtube | top_domains | { "rank" : "2", "domain" : "youtube", "a-new-prop" : "metron-ftw" } | + ## Enrichment Config In order to automatically add new enrichment and threat intel types to existing, running enrichment topologies, you will diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java index 37693c18d8..93438d3bcc 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/Extractors.java @@ -47,13 +47,12 @@ public Extractor create() { } public static Extractor create(String extractorName) throws ClassNotFoundException, IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { try { - //TODO create decorated
extractor here - in init method setup Stellar ExtractorCreator ec = Extractors.valueOf(extractorName); return new TransformFilterExtractorDecorator(ec.create()); } catch(IllegalArgumentException iae) { Extractor ex = (Extractor) Class.forName(extractorName).getConstructor().newInstance(); - return ex; + return new TransformFilterExtractorDecorator(ex); } } } diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index 4475037b39..8f484ff643 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.metron.common.configuration.ConfigurationsUtils; +import org.apache.metron.dataloads.extractor.csv.CSVExtractor; import org.apache.metron.dataloads.hbase.mr.HBaseUtil; import org.apache.metron.enrichment.converter.EnrichmentConverter; import org.apache.metron.enrichment.converter.EnrichmentKey; @@ -84,6 +85,7 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { private static final File lineByLineExtractorConfigFile = new File("target/sefflt_extractorConfig_lbl.json"); private static final File wholeFileExtractorConfigFile = new File("target/sefflt_extractorConfig_wf.json"); private static final File stellarExtractorConfigFile = new File("target/sefflt_extractorConfig_stellar.json"); + private static final File customLineByLineExtractorConfigFile = new File("target/sefflt_extractorConfig_custom.json"); private static final 
int NUM_LINES = 1000; /** @@ -140,7 +142,8 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { * }, * "value_transform" : { * "host" : "TO_UPPER(host)", - * "empty" : "enrichment_property" + * "empty" : "enrichment_property", + * "BLAH" : "enrichment_property" * }, * "value_filter" : "LENGTH(host) > 0", * "indicator_column" : "host", @@ -157,6 +160,31 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { @Multiline public static String stellarExtractorConfig; + /** + *{ + * "config" : { + * "columns" : { + * "host" : 0, + * "meta" : 2 + * }, + * "value_transform" : { + * "host" : "TO_UPPER(host)" + * }, + * "value_filter" : "LENGTH(host) > 0", + * "indicator_column" : "host", + * "indicator_transform" : { + * "indicator" : "TO_UPPER(indicator)" + * }, + * "indicator_filter" : "LENGTH(indicator) > 0", + * "type" : "enrichment", + * "separator" : "," + * }, + * "extractor" : "%EXTRACTOR_CLASS%" + *} + */ + @Multiline + private static String customLineByLineExtractorConfig; + @BeforeClass public static void setup() throws Exception { UnitTestHelper.setJavaLoggingLevel(Level.SEVERE); @@ -190,7 +218,14 @@ public static void setup() throws Exception { stellarExtractorConfigFile.delete(); } Files.write( stellarExtractorConfigFile.toPath() - , stellarExtractorConfig.replace("%ZK_QUORUM%", zookeeperUrl).getBytes() + , stellarExtractorConfig.replace("%ZK_QUORUM%", zookeeperUrl).getBytes() + , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING + ); + if(customLineByLineExtractorConfigFile.exists()) { + customLineByLineExtractorConfigFile.delete(); + } + Files.write( customLineByLineExtractorConfigFile.toPath() + , customLineByLineExtractorConfig.replace("%EXTRACTOR_CLASS%", CSVExtractor.class.getName()).getBytes() , StandardOpenOption.CREATE_NEW , StandardOpenOption.TRUNCATE_EXISTING ); if(file1.exists()) { @@ -431,4 +466,27 @@ public void stellar_transforms_and_filters_indicators_and_value_metadata() throw 
Assert.assertThat(results.get(0).getValue().getMetadata().get("host").toString(), startsWith("GOOGLE")); } + @Test + public void custom_extractor_transforms_and_filters_indicators_and_value_metadata() throws Exception { + String[] argv = {"-c cf", "-t enrichment" + , "-e " + customLineByLineExtractorConfigFile.getPath() + , "-i " + multilineFile.getPath() + , "-p 2", "-b 128", "-q" + }; + SimpleEnrichmentFlatFileLoader.main(config, argv); + EnrichmentConverter converter = new EnrichmentConverter(); + ResultScanner scanner = testTable.getScanner(Bytes.toBytes(cf)); + List> results = new ArrayList<>(); + for (Result r : scanner) { + results.add(converter.fromResult(r, cf)); + testTable.delete(new Delete(r.getRow())); + } + Assert.assertEquals(NUM_LINES, results.size()); + Assert.assertThat(results.get(0).getKey().getIndicator(), startsWith("GOOGLE")); + Assert.assertThat(results.get(0).getKey().type, equalTo("enrichment")); + Assert.assertThat(results.get(0).getValue().getMetadata().size(), equalTo(2)); + Assert.assertThat(results.get(0).getValue().getMetadata().get("meta").toString(), startsWith("foo")); + Assert.assertThat(results.get(0).getValue().getMetadata().get("host").toString(), startsWith("GOOGLE")); + } + } From b5cc03d9a1163eccdb74f6c4db76e1dd23ab9cc5 Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Tue, 7 Feb 2017 18:19:31 -0700 Subject: [PATCH 31/32] Make extractortest happy --- .../dataloads/extractor/TransformFilterExtractorDecorator.java | 2 ++ .../SimpleEnrichmentFlatFileLoaderIntegrationTest.java | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index 19de83e380..b1190482a6 100644 --- 
a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -81,6 +81,8 @@ public TransformFilterExtractorDecorator(Extractor decoratedExtractor) { this.indicatorTransforms = new LinkedHashMap<>(); this.valueFilter = ""; this.indicatorFilter = ""; + this.transformProcessor = new StellarProcessor(); + this.filterProcessor = new StellarPredicateProcessor(); } @Override diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index 8f484ff643..3911076b82 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -142,8 +142,7 @@ public class SimpleEnrichmentFlatFileLoaderIntegrationTest { * }, * "value_transform" : { * "host" : "TO_UPPER(host)", - * "empty" : "enrichment_property", - * "BLAH" : "enrichment_property" + * "empty" : "enrichment_property" * }, * "value_filter" : "LENGTH(host) > 0", * "indicator_column" : "host", From 03daced2b5ed8a01e741ab19f8982978d4ea314b Mon Sep 17 00:00:00 2001 From: Michael Miklavcic Date: Wed, 8 Feb 2017 11:55:51 -0700 Subject: [PATCH 32/32] Fix some issues and suggestions --- .../metron/common/utils/ConversionUtils.java | 8 -------- .../common/utils/ConversionUtilsTest.java | 18 ------------------ .../TransformFilterExtractorDecorator.java | 13 ++++++------- .../extractor/ExtractorDecoratorTest.java | 4 ++-- 
.../TransformFilterExtractorDecoratorTest.java | 12 ++++-------- ...nrichmentFlatFileLoaderIntegrationTest.java | 1 + 6 files changed, 13 insertions(+), 43 deletions(-) diff --git a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java index a83a144b22..97b36ee265 100644 --- a/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java +++ b/metron-platform/metron-common/src/main/java/org/apache/metron/common/utils/ConversionUtils.java @@ -35,14 +35,6 @@ protected ConvertUtilsBean initialValue() { } }; - public static T convertOrFail(Object o, Class clazz) { - if (clazz.isInstance(o)) { - return convert(o, clazz); - } else { - throw new IllegalArgumentException(String.format("Object is not of type %s", clazz.getCanonicalName())); - } - } - public static T convert(Object o, Class clazz) { if (o == null) { return null; diff --git a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java index 06c767ab9e..7c825a183b 100644 --- a/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java +++ b/metron-platform/metron-common/src/test/java/org/apache/metron/common/utils/ConversionUtilsTest.java @@ -19,9 +19,7 @@ package org.apache.metron.common.utils; import org.junit.Assert; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.ExpectedException; public class ConversionUtilsTest { @@ -33,20 +31,4 @@ public void testIntegerConversions() { Assert.assertNull(ConversionUtils.convert("foo", Integer.class)); } - @Test - public void same_object_type_hierarchy_will_pass_convertOrFail() { - Assert.assertEquals(new Integer(5), ConversionUtils.convertOrFail(new Integer(5), Integer.class)); - 
Assert.assertEquals(new Integer(5), ConversionUtils.convertOrFail(new Integer(5), Number.class)); - } - - @Rule - public final ExpectedException exception = ExpectedException.none(); - - @Test - public void different_object_types_will_fail_convertOrFail() { - exception.expect(IllegalArgumentException.class); - exception.expectMessage("Object is not of type java.lang.String"); - ConversionUtils.convertOrFail(new Integer(5), String.class); - } - } diff --git a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java index b1190482a6..a1448d98f4 100644 --- a/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java +++ b/metron-platform/metron-data-management/src/main/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecorator.java @@ -113,7 +113,7 @@ public void initialize(Map config) { } private String getFilter(Map config, String valueFilter) { - return ConversionUtils.convertOrFail(config.get(valueFilter), String.class); + return (String) config.get(valueFilter); } /** @@ -123,12 +123,11 @@ private String getFilter(Map config, String valueFilter) { * @return map of transformations. 
*/ private Map getTransforms(Map config, String type) { - Map transformsConfig = ConversionUtils.convertOrFail(config.get(type), Map.class); + // If this isn't a Map of Strings, let an exception be thrown + @SuppressWarnings("unchecked") Map transformsConfig = (Map) config.get(type); Map transforms = new LinkedHashMap<>(); for (Map.Entry e : transformsConfig.entrySet()) { - String key = ConversionUtils.convertOrFail(e.getKey(), String.class); - String val = ConversionUtils.convertOrFail(e.getValue(), String.class); - transforms.put(key, val); + transforms.put((String) e.getKey(), (String) e.getValue()); } return transforms; } @@ -170,9 +169,9 @@ private Map getGlobalConfig(Optional zkClient) private Context createContext(Optional zkClient) { Context.Builder builder = new Context.Builder(); if (zkClient.isPresent()) { - builder.with(Context.Capabilities.ZOOKEEPER_CLIENT, zkClient::get) - .with(Context.Capabilities.GLOBAL_CONFIG, () -> globalConfig); + builder.with(Context.Capabilities.ZOOKEEPER_CLIENT, zkClient::get); } + builder.with(Context.Capabilities.GLOBAL_CONFIG, () -> globalConfig); return builder.build(); } diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java index 0526d012ae..93c809845e 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/ExtractorDecoratorTest.java @@ -17,7 +17,6 @@ */ package org.apache.metron.dataloads.extractor; -import org.hamcrest.Matchers; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -28,6 +27,7 @@ import java.util.HashMap; import java.util.Map; +import static org.hamcrest.Matchers.notNullValue; import static 
org.mockito.Matchers.isA; import static org.mockito.Mockito.verify; @@ -44,7 +44,7 @@ public void before() { @Test public void sets_member_variables() { ExtractorDecorator decorator = new ExtractorDecorator(extractor); - Assert.assertThat(decorator.decoratedExtractor, Matchers.notNullValue()); + Assert.assertThat(decorator.decoratedExtractor, notNullValue()); } @Test diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java index 80738b250e..61443c2658 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/extractor/TransformFilterExtractorDecoratorTest.java @@ -149,8 +149,7 @@ public void filters_indicators() throws Exception { @Test public void bad_value_transform_causes_exception() throws Exception { final int badValue = 5; - exception.expect(IllegalArgumentException.class); - exception.expectMessage("Object is not of type java.util.Map"); + exception.expect(ClassCastException.class); config1.put(TransformFilterExtractorDecorator.ExtractorOptions.VALUE_TRANSFORM.toString(), badValue); decorator = new TransformFilterExtractorDecorator(extractor); decorator.setZkClient(Optional.of(zkClient)); @@ -160,8 +159,7 @@ public void bad_value_transform_causes_exception() throws Exception { @Test public void bad_value_filter_causes_exception() throws Exception { final int badValue = 5; - exception.expect(IllegalArgumentException.class); - exception.expectMessage("Object is not of type java.lang.String"); + exception.expect(ClassCastException.class); config1.put(TransformFilterExtractorDecorator.ExtractorOptions.VALUE_FILTER.toString(), badValue); decorator = new 
TransformFilterExtractorDecorator(extractor); decorator.setZkClient(Optional.of(zkClient)); @@ -171,8 +169,7 @@ public void bad_value_filter_causes_exception() throws Exception { @Test public void bad_indicator_transform_causes_exception() throws Exception { final int badValue = 5; - exception.expect(IllegalArgumentException.class); - exception.expectMessage("Object is not of type java.util.Map"); + exception.expect(ClassCastException.class); config1.put(TransformFilterExtractorDecorator.ExtractorOptions.INDICATOR_TRANSFORM.toString(), badValue); decorator = new TransformFilterExtractorDecorator(extractor); decorator.setZkClient(Optional.of(zkClient)); @@ -182,8 +179,7 @@ public void bad_indicator_transform_causes_exception() throws Exception { @Test public void bad_indicator_filter_causes_exception() throws Exception { final int badValue = 5; - exception.expect(IllegalArgumentException.class); - exception.expectMessage("Object is not of type java.lang.String"); + exception.expect(ClassCastException.class); config1.put(TransformFilterExtractorDecorator.ExtractorOptions.INDICATOR_FILTER.toString(), badValue); decorator = new TransformFilterExtractorDecorator(extractor); decorator.setZkClient(Optional.of(zkClient)); diff --git a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java index 3911076b82..443d39dae7 100644 --- a/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java +++ b/metron-platform/metron-data-management/src/test/java/org/apache/metron/dataloads/nonbulk/flatfile/SimpleEnrichmentFlatFileLoaderIntegrationTest.java @@ -296,6 +296,7 @@ public static void teardown() throws Exception { 
lineByLineExtractorConfigFile.delete(); wholeFileExtractorConfigFile.delete(); stellarExtractorConfigFile.delete(); + customLineByLineExtractorConfigFile.delete(); }