From 2d4439c78b835ea7186e3c5ef65e9a594b064944 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Sun, 23 Apr 2017 01:19:29 -0700
Subject: [PATCH 1/6] trappy: Improve performance of parser

This patch reduces the frequency of calling getrusage(), resulting in a
~3-5% performance improvement in parsing. We do a check at the
beginning and then once every 10000 events.
Signed-off-by: Joel Fernandes
---
 trappy/base.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/trappy/base.py b/trappy/base.py
index 8a1b976d..38fb84a3 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -174,9 +174,11 @@ def append_data(self, time, comm, pid, cpu, data):
     def generate_parsed_data(self):
 
         # Get a rough idea of how much memory we have to play with
+        CHECK_MEM_COUNT = 10000
         kb_free = _get_free_memory_kb()
         starting_maxrss = getrusage(RUSAGE_SELF).ru_maxrss
         check_memory_usage = True
+        check_memory_count = 1
 
         for (comm, pid, cpu, data_str) in zip(self.comm_array, self.pid_array,
                                               self.cpu_array, self.data_array):
@@ -200,13 +202,20 @@ def generate_parsed_data(self):
             # When running out of memory, Pandas has been observed to segfault
             # rather than throwing a proper Python error.
             # Look at how much memory our process is using and warn if we seem
-            # to be getting close to the system's limit.
+            # to be getting close to the system's limit, check it only once
+            # in the beginning and then every CHECK_MEM_COUNT events
+            check_memory_count -= 1
+            if check_memory_count != 0:
+                yield data_dict
+                continue
+
             kb_used = (getrusage(RUSAGE_SELF).ru_maxrss - starting_maxrss)
             if check_memory_usage and kb_free and kb_used > kb_free * 0.9:
                 warnings.warn("TRAPpy: Appear to be low on memory. "
                               "If errors arise, try providing more RAM")
                 check_memory_usage = False
+            check_memory_count = CHECK_MEM_COUNT
 
             yield data_dict
 
     def create_dataframe(self):

From bffdec055edeeff916d93df186dd405dd5d2ba30 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Sun, 23 Apr 2017 01:46:28 -0700
Subject: [PATCH 2/6] trappy: Split out parsing of trace string into separate function

In preparation for introducing event callbacks, split out the string
parsing into a separate function.
Signed-off-by: Joel Fernandes
---
 trappy/base.py | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/trappy/base.py b/trappy/base.py
index 38fb84a3..0bacf9ea 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -171,6 +171,25 @@ def append_data(self, time, comm, pid, cpu, data):
         self.cpu_array.append(cpu)
         self.data_array.append(data)
 
+    def generate_data_dict(self, comm, pid, cpu, data_str):
+        data_dict = {"__comm": comm, "__pid": pid, "__cpu": cpu}
+        prev_key = None
+        for field in data_str.split():
+            if "=" not in field:
+                # Concatenation is supported only for "string" values
+                if type(data_dict[prev_key]) is not str:
+                    continue
+                data_dict[prev_key] += ' ' + field
+                continue
+            (key, value) = field.split('=', 1)
+            try:
+                value = int(value)
+            except ValueError:
+                pass
+            data_dict[key] = value
+            prev_key = key
+        return data_dict
+
     def generate_parsed_data(self):
 
         # Get a rough idea of how much memory we have to play with
@@ -182,22 +201,7 @@ def generate_parsed_data(self):
 
         for (comm, pid, cpu, data_str) in zip(self.comm_array, self.pid_array,
                                               self.cpu_array, self.data_array):
-            data_dict = {"__comm": comm, "__pid": pid, "__cpu": cpu}
-            prev_key = None
-            for field in data_str.split():
-                if "=" not in field:
-                    # Concatenation is supported only for "string" values
-                    if type(data_dict[prev_key]) is not str:
-                        continue
-                    data_dict[prev_key] += ' ' + field
-                    continue
-                (key, value) = field.split('=', 1)
-                try:
-                    value = int(value)
-                except ValueError:
-                    pass
-                data_dict[key] = value
-                prev_key = key
+            data_dict = self.generate_data_dict(comm, pid, cpu, data_str)
 
             # When running out of memory, Pandas has been observed to segfault
             # rather than throwing a proper Python error.

From b1974af446b843f199d3a538081bfe553e3ff8a6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Sun, 23 Apr 2017 02:27:29 -0700
Subject: [PATCH 3/6] trappy: Add support for ftrace event callbacks

Event callbacks are called per event, depending on the trace class.
Each callback is passed the event's timestamp and a dict of all parsed
fields.
Signed-off-by: Joel Fernandes
---
 trappy/base.py     |  6 ++++++
 trappy/ftrace.py   | 10 +++++++---
 trappy/systrace.py |  5 +++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/trappy/base.py b/trappy/base.py
index 0bacf9ea..a5431fbd 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -105,6 +105,7 @@ def __init__(self, parse_raw=False):
         self.comm_array = []
         self.pid_array = []
         self.cpu_array = []
+        self.callback = None
         self.parse_raw = parse_raw
 
     def finalize_object(self):
@@ -171,6 +172,11 @@ def append_data(self, time, comm, pid, cpu, data):
         self.cpu_array.append(cpu)
         self.data_array.append(data)
 
+        if not self.callback:
+            return
+        data_dict = self.generate_data_dict(comm, pid, cpu, data)
+        self.callback(time, data_dict)
+
     def generate_data_dict(self, comm, pid, cpu, data_str):
         data_dict = {"__comm": comm, "__pid": pid, "__cpu": cpu}
         prev_key = None
diff --git a/trappy/ftrace.py b/trappy/ftrace.py
index 6a5fce0a..168ac593 100644
--- a/trappy/ftrace.py
+++ b/trappy/ftrace.py
@@ -54,7 +54,8 @@ class GenericFTrace(BareTrace):
     dynamic_classes = {}
 
     def __init__(self, name="", normalize_time=True, scope="all",
-                 events=[], window=(0, None), abs_window=(0, None)):
+                 events=[], event_callbacks={}, window=(0, None),
+                 abs_window=(0, None)):
         super(GenericFTrace, self).__init__(name)
 
         if not hasattr(self, "needs_raw_parsing"):
@@ -73,6 +74,8 @@ def __init__(self, name="", normalize_time=True, scope="all",
 
         for attr, class_def in self.class_definitions.iteritems():
             trace_class = class_def()
+            if event_callbacks.has_key(attr):
+                trace_class.callback = event_callbacks[attr]
             setattr(self, attr, trace_class)
             self.trace_classes.append(trace_class)
 
@@ -480,14 +483,15 @@ class FTrace(GenericFTrace):
     """
 
     def __init__(self, path=".", name="", normalize_time=True, scope="all",
-                 events=[], window=(0, None), abs_window=(0, None)):
+                 events=[], event_callbacks={}, window=(0, None),
+                 abs_window=(0, None)):
         self.trace_path, self.trace_path_raw = self.__process_path(path)
         self.needs_raw_parsing = True
 
         self.__populate_metadata()
 
         super(FTrace, self).__init__(name, normalize_time, scope, events,
-                                     window, abs_window)
+                                     event_callbacks, window, abs_window)
 
     def __process_path(self, basepath):
         """Process the path and return the path to the trace text file"""
diff --git a/trappy/systrace.py b/trappy/systrace.py
index 6e917a65..c6190114 100644
--- a/trappy/systrace.py
+++ b/trappy/systrace.py
@@ -50,12 +50,13 @@ class SysTrace(GenericFTrace):
     """
 
     def __init__(self, path=".", name="", normalize_time=True, scope="all",
-                 events=[], window=(0, None), abs_window=(0, None)):
+                 events=[], event_callbacks={}, window=(0, None),
+                 abs_window=(0, None)):
 
         self.trace_path = path
 
         super(SysTrace, self).__init__(name, normalize_time, scope, events,
-                                       window, abs_window)
+                                       event_callbacks, window, abs_window)
 
         try:
             self._cpus = 1 + self.sched_switch.data_frame["__cpu"].max()

From dd299a8c5f2ec1025efdef48004d47798d35bfb5 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Sun, 23 Apr 2017 03:52:01 -0700
Subject: [PATCH 4/6] trace: Add support for skipping build of dataframes

For event-callback-driven scripts, it is useful to avoid building data
frames that aren't going to be used anyway, so add a parameter to skip
building them. This saves ~30% of the time an event-callback-based
script takes.
Signed-off-by: Joel Fernandes
---
 trappy/bare_trace.py | 5 ++++-
 trappy/ftrace.py     | 9 +++++----
 trappy/systrace.py   | 8 +++++---
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/trappy/bare_trace.py b/trappy/bare_trace.py
index f3fbd58c..9787f04d 100644
--- a/trappy/bare_trace.py
+++ b/trappy/bare_trace.py
@@ -27,12 +27,13 @@ class BareTrace(object):
 
     """
 
-    def __init__(self, name=""):
+    def __init__(self, name="", build_df=True):
         self.name = name
         self.normalized_time = False
         self.class_definitions = {}
         self.trace_classes = []
         self.basetime = 0
+        self.build_df = build_df
 
     def get_duration(self):
         """Returns the largest time value of all classes,
@@ -133,6 +134,8 @@ def add_parsed_event(self, name, dfr, pivot=None):
         setattr(self, name, event)
 
     def finalize_objects(self):
+        if not self.build_df:
+            return
         for trace_class in self.trace_classes:
             trace_class.create_dataframe()
             trace_class.finalize_object()
diff --git a/trappy/ftrace.py b/trappy/ftrace.py
index 168ac593..a73d02d5 100644
--- a/trappy/ftrace.py
+++ b/trappy/ftrace.py
@@ -55,8 +55,8 @@ class GenericFTrace(BareTrace):
 
     def __init__(self, name="", normalize_time=True, scope="all",
                  events=[], event_callbacks={}, window=(0, None),
-                 abs_window=(0, None)):
-        super(GenericFTrace, self).__init__(name)
+                 abs_window=(0, None), build_df=True):
+        super(GenericFTrace, self).__init__(name, build_df)
 
         if not hasattr(self, "needs_raw_parsing"):
             self.needs_raw_parsing = False
@@ -484,14 +484,15 @@ class FTrace(GenericFTrace):
 
     def __init__(self, path=".", name="", normalize_time=True, scope="all",
                  events=[], event_callbacks={}, window=(0, None),
-                 abs_window=(0, None)):
+                 abs_window=(0, None), build_df=True):
         self.trace_path, self.trace_path_raw = self.__process_path(path)
         self.needs_raw_parsing = True
 
         self.__populate_metadata()
 
         super(FTrace, self).__init__(name, normalize_time, scope, events,
-                                     event_callbacks, window, abs_window)
+                                     event_callbacks, window, abs_window,
+                                     build_df)
 
     def __process_path(self, basepath):
         """Process the path and return the path to the trace text file"""
diff --git a/trappy/systrace.py b/trappy/systrace.py
index c6190114..c11601da 100644
--- a/trappy/systrace.py
+++ b/trappy/systrace.py
@@ -51,13 +51,15 @@ class SysTrace(GenericFTrace):
 
     def __init__(self, path=".", name="", normalize_time=True, scope="all",
                  events=[], event_callbacks={}, window=(0, None),
-                 abs_window=(0, None)):
+                 abs_window=(0, None), build_df=True):
 
         self.trace_path = path
 
         super(SysTrace, self).__init__(name, normalize_time, scope, events,
-                                       event_callbacks, window, abs_window)
-
+                                       event_callbacks, window, abs_window,
+                                       build_df)
+        if not build_df:
+            return
         try:
             self._cpus = 1 + self.sched_switch.data_frame["__cpu"].max()
         except AttributeError:

From 90c79d84a68dfd131151f897b99543411ee23a34 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Fri, 28 Apr 2017 21:59:05 -0700
Subject: [PATCH 5/6] trappy/sched: Move line processing earlier

Instead of processing the line at create_dataframe time, move the
processing up to append time; this avoids passing "==>" to the
callbacks. No change in performance or functionality for existing
users.
Signed-off-by: Joel Fernandes
---
 trappy/sched.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/trappy/sched.py b/trappy/sched.py
index 4a68f6a5..b8d9cc58 100644
--- a/trappy/sched.py
+++ b/trappy/sched.py
@@ -108,11 +108,10 @@ class SchedSwitch(Base):
     def __init__(self):
         super(SchedSwitch, self).__init__(parse_raw=True)
 
-    def create_dataframe(self):
-        self.data_array = [line.replace(" ==> ", " ", 1)
-                           for line in self.data_array]
-
-        super(SchedSwitch, self).create_dataframe()
+    def append_data(self, time, comm, pid, cpu, data):
+        data_rep = data.replace(" ==> ", " ")
+        super(SchedSwitch, self).append_data(time, comm, pid, cpu,
+                                             data_rep)
 
 
 register_ftrace_parser(SchedSwitch, "sched")

From 0dfed97fd79b327f6840ca3b6c11dfc84f4a43f3 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Fri, 28 Apr 2017 22:11:47 -0700
Subject: [PATCH 6/6] trappy/ftrace: normalize time early

Normalize the time earlier than before. This is needed to make sure
normalized time is passed to callbacks, and it also removes the need to
reset the index after dataframe generation, improving performance for
the regular cases.
Signed-off-by: Joel Fernandes
---
 trappy/base.py   | 12 ------------
 trappy/ftrace.py |  6 +++---
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/trappy/base.py b/trappy/base.py
index a5431fbd..aebda802 100644
--- a/trappy/base.py
+++ b/trappy/base.py
@@ -256,15 +256,3 @@ def write_csv(self, fname):
         :type fname: str
         """
         self.data_frame.to_csv(fname)
-
-    def normalize_time(self, basetime):
-        """Substract basetime from the Time of the data frame
-
-        :param basetime: The offset which needs to be subtracted from
-        the time index
-        :type basetime: float
-        """
-        if basetime and not self.data_frame.empty:
-            self.data_frame.reset_index(inplace=True)
-            self.data_frame["Time"] = self.data_frame["Time"] - basetime
-            self.data_frame.set_index("Time", inplace=True)
diff --git a/trappy/ftrace.py b/trappy/ftrace.py
index a73d02d5..4726a3f5 100644
--- a/trappy/ftrace.py
+++ b/trappy/ftrace.py
@@ -85,9 +85,6 @@ def __init__(self, name="", normalize_time=True, scope="all",
                                     raw=True)
         self.finalize_objects()
 
-        if normalize_time:
-            self.normalize_time()
-
     @classmethod
     def register_parser(cls, cobject, scope):
         """Register the class as an Event. This function
@@ -210,6 +207,9 @@ def contains_unique_word(line, unique_words=cls_for_unique_word.keys()):
            except AttributeError:
                continue

+           if self.normalize_time:
+               timestamp = timestamp - self.basetime
+
            data_str = line[data_start_idx:]

            # Remove empty arrays from the trace
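
A minimal usage sketch of the API added by this series (not part of the
patches themselves, and the trace path "trace.txt" and the print format
are placeholders): event_callbacks is keyed by the trace attribute name
(e.g. "sched_switch"), each callback receives the event timestamp
(normalized once patch 6 is applied) and the parsed field dict built by
generate_data_dict(), and build_df=False skips dataframe construction
when only the callbacks are needed.

    import trappy

    def sched_switch_cb(time, data):
        # Every parsed dict carries "__comm", "__pid" and "__cpu" in
        # addition to the event's own key=value fields.
        print("%s: comm=%s pid=%s cpu=%s" %
              (time, data["__comm"], data["__pid"], data["__cpu"]))

    # build_df=False means finalize_objects() returns early, so only the
    # callback above does any work while the trace is parsed.
    trace = trappy.FTrace("trace.txt",
                          events=["sched_switch"],
                          event_callbacks={"sched_switch": sched_switch_cb},
                          build_df=False)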