diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 743b491..c47ea17 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -36,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: diff --git a/test_data/build_stats.py b/test_data/build_stats.py index f9923f0..5fdfdb7 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -35,4 +35,5 @@ output_file.write(str(len(nodes_per_level)) + "\n") for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") -output_file.close() \ No newline at end of file +output_file.close() + diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index 875f85b..0672351 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -21,5 +21,5 @@ dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") -dict_file.close() \ No newline at end of file + dict_file.write(key + "\n") +dict_file.close() diff --git a/test_data/map_dataset.py b/test_data/map_dataset.py index 86e6357..beb7155 100644 --- a/test_data/map_dataset.py +++ b/test_data/map_dataset.py @@ -24,7 +24,7 @@ string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) @@ -48,4 +48,4 @@ stats_file.write(str(len(tokens)) + "\n") stats_file.write(str(max_string_len) + "\n") -stats_file.close() \ No newline at end of file +stats_file.close()