From 395cc6f2fd1606a6a0d42b670744c8febbd7ab9e Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Fri, 21 May 2021 16:20:17 +0300 Subject: [PATCH 1/4] Add a script for filtering inspections --- src/python/evaluation/common/util.py | 6 ++ .../evaluation/inspectors/filter_issues.py | 8 +-- src/python/evaluation/qodana/README.md | 40 ++++++++++++- .../evaluation/qodana/dataset_marking.py | 12 +--- .../evaluation/qodana/filter_inspections.py | 60 +++++++++++++++++++ src/python/evaluation/qodana/util/util.py | 11 ++++ src/python/review/common/file_system.py | 9 ++- whitelist.txt | 1 + 8 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 src/python/evaluation/qodana/filter_inspections.py create mode 100644 src/python/evaluation/qodana/util/util.py diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py index 271956f1..585e4ddd 100644 --- a/src/python/evaluation/common/util.py +++ b/src/python/evaluation/common/util.py @@ -1,4 +1,5 @@ from enum import Enum, unique +from typing import Set from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import Extension @@ -39,3 +40,8 @@ class EvaluationArgument(Enum): f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, ' f'{LanguageVersion.JAVA_8.value} ,' f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.') + + +# Split string by separator +def parse_set_arg(str_arg: str, separator: str = ',') -> Set[str]: + return set(str_arg.split(separator)) diff --git a/src/python/evaluation/inspectors/filter_issues.py b/src/python/evaluation/inspectors/filter_issues.py index ca4b38b6..6a4b115d 100644 --- a/src/python/evaluation/inspectors/filter_issues.py +++ b/src/python/evaluation/inspectors/filter_issues.py @@ -5,7 +5,7 @@ import pandas as pd from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.pandas_util import get_issues_from_json, 
get_solutions_df_by_file_path -from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file from src.python.review.inspectors.issue import BaseIssue @@ -26,10 +26,6 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: default='') -def __parse_issues_arg(str_issues: str) -> Set[str]: - return set(str_issues.split(',')) - - def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[BaseIssue]: all_issues = get_issues_from_json(traceback) return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues)) @@ -59,7 +55,7 @@ def main() -> None: solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) - issues = __parse_issues_arg(args.issues) + issues = parse_set_arg(args.issues) diffs = get_statistics_dict(solutions_df, issues) output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}' diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 35ccb782..a18748a4 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -1,5 +1,6 @@ # Dataset label -This script allows you to mark up a dataset using the found [Qodana](https://github.com/JetBrains/Qodana) inspections. + +[This](dataset_marking.py) script allows you to mark up a dataset using the found [Qodana](https://github.com/JetBrains/Qodana) inspections. The dataset must contain at least three columns: `id`, `code` and `lang`, where `id` is a unique solution number, `lang` is the language in which the code is written in the `code` column. The `lang` must belong to one of the following values: `java7`, `java8`, `java9`, `java11`, `python3`, `kotlin`. 
If `lang` is not equal to any of the values, the row will be skipped. @@ -21,3 +22,40 @@ Run the [dataset_marking.py](dataset_marking.py) with the arguments from command | **‑l**, **‑‑limit** | Allows you to read only the specified number of first rows from the dataset. If no limit is specified, the whole dataset will be processed. | | **‑s**, **‑‑chunk‑size** | The number of files that Qodana will process at a time. Default is `5000`. | | **‑o**, **‑‑dataset‑output‑path** | The path where the marked dataset will be saved. If not specified, the original dataset will be overwritten. | + +--- + +# Postprocessing + +The model that imitates Qodana analysis gets input from a dataset in a special format. +This module allows preparing datasets that were graded by [dataset_marking.py](dataset_marking.py) script. + +Data processing consists of several stages: +- union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script + and filter inspections list if it is necessary; +- convert `csv` file into a special format. + +## Filter inspections + +This stage allow you to union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script + and filter inspections list if it is necessary. + +Please, note that your all input files must be graded by [dataset_marking.py](dataset_marking.py) script +and have `inspections` column. + +Output file is a new `csv` file with the all columns from the input files. + +#### Usage + +Run the [filter_inspections.py](filter_inspections.py) with the arguments from command line. + +Required arguments: + +`dataset_folder` — path to a folder with csv files graded by Qodana. Each file must have `inspections` column. + +Optional arguments: +Argument | Description +--- | --- +|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. | + +The resulting file will be stored in the `dataset_folder`. 
diff --git a/src/python/evaluation/qodana/dataset_marking.py b/src/python/evaluation/qodana/dataset_marking.py index 00f17667..64716d27 100644 --- a/src/python/evaluation/qodana/dataset_marking.py +++ b/src/python/evaluation/qodana/dataset_marking.py @@ -19,7 +19,8 @@ from pandas import DataFrame from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.util import ColumnName -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import to_json from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import ( create_directory, get_content_from_file, get_name_from_path, get_parent_folder, remove_directory, remove_slash, @@ -179,13 +180,6 @@ def _parse_inspections_files(cls, inspections_files: Set[Path]) -> Dict[int, Lis id_to_issues[fragment_id].append(qodana_issue) return id_to_issues - @classmethod - def _to_json(cls, issues: List[QodanaIssue]) -> str: - issues_json = { - QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)), - } - return json.dumps(issues_json) - def _mark_chunk(self, chunk: DataFrame, language: LanguageVersion, chunk_id: int) -> pd.DataFrame: tmp_file_path = self.dataset_path.parent.absolute() / f'qodana_project_{chunk_id}' create_directory(tmp_file_path) @@ -212,7 +206,7 @@ def _mark_chunk(self, chunk: DataFrame, language: LanguageVersion, chunk_id: int logger.info("Write inspections") chunk[QodanaColumnName.INSPECTIONS.value] = chunk.apply( - lambda row: self._to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1) + lambda row: to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1) remove_directory(tmp_file_path) return chunk diff --git a/src/python/evaluation/qodana/filter_inspections.py b/src/python/evaluation/qodana/filter_inspections.py 
new file mode 100644 index 00000000..6f758965 --- /dev/null +++ b/src/python/evaluation/qodana/filter_inspections.py @@ -0,0 +1,60 @@ +import argparse +import json +from pathlib import Path +from typing import List + +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import parse_set_arg +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.util import to_json +from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument('dataset_folder', + type=lambda value: Path(value).absolute(), + help='Path to a folder with csv files graded by Qodana. ' + 'Each file must have "inspections" column.') + + parser.add_argument('-i', '--inspections', + help='Set of inspections ids to exclude from the dataset', + type=str, + default='') + + +def __get_qodana_dataset(root: Path) -> pd.DataFrame: + if not root.is_dir(): + raise ValueError(f'The {root} is not a directory') + dataset_files = get_all_file_system_items(root, extension_file_condition(Extension.CSV)) + datasets = [] + for file_path in dataset_files: + datasets.append(get_solutions_df_by_file_path(file_path)) + return pd.concat(datasets) + + +def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str: + issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] + filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, + map(lambda i: QodanaIssue.from_json(i), issues_list))) + return to_json(filtered_issues) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + dataset_folder = args.dataset_folder + 
full_dataset = __get_qodana_dataset(dataset_folder) + inspections_to_keep = parse_set_arg(args.inspections) + + full_dataset[QodanaColumnName.INSPECTIONS.value] = full_dataset.apply( + lambda row: __filter_inspections(row[QodanaColumnName.INSPECTIONS.value], inspections_to_keep), axis=1) + + write_dataframe_to_csv(dataset_folder / f'filtered_issues{Extension.CSV.value}', full_dataset) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/util.py b/src/python/evaluation/qodana/util/util.py new file mode 100644 index 00000000..0c4b8712 --- /dev/null +++ b/src/python/evaluation/qodana/util/util.py @@ -0,0 +1,11 @@ +import json +from typing import List + +from src.python.evaluation.qodana.util.models import QodanaIssue, QodanaJsonField + + +def to_json(issues: List[QodanaIssue]) -> str: + issues_json = { + QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)), + } + return json.dumps(issues_json) diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index 3e2e8bce..b2ff6dcb 100644 --- a/src/python/review/common/file_system.py +++ b/src/python/review/common/file_system.py @@ -50,6 +50,13 @@ def all_items_condition(name: str) -> bool: return True +def extension_file_condition(extension: Extension) -> ItemCondition: + def has_this_extension(name: str) -> bool: + return get_extension_from_file(name) == extension + + return has_this_extension + + # To get all files or subdirs (depends on the last parameter) from root that match item_condition # Note that all subdirs or files already contain the full path for them def get_all_file_system_items(root: Path, item_condition: ItemCondition = all_items_condition, @@ -149,7 +156,7 @@ def get_content_from_file(file_path: Path, encoding: str = Encoding.ISO_ENCODING # Not empty extensions are returned with a dot, for example, '.txt' # If file has no extensions, an empty one ('') is returned -def get_extension_from_file(file: Path) -> Extension: 
+def get_extension_from_file(file: Union[Path, str]) -> Extension: return Extension(os.path.splitext(file)[1]) diff --git a/whitelist.txt b/whitelist.txt index 3e331750..e7c8e657 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -114,3 +114,4 @@ nrows groupby getuid Popen +datasets From 082ca6d31aa67b19c3b0fb7cb6a3d78a341c7535 Mon Sep 17 00:00:00 2001 From: Nastya Birillo Date: Mon, 24 May 2021 20:01:09 +0300 Subject: [PATCH 2/4] Qoadana handlers/get unique inspections (#35) Add handlers for getting unique inspections Add a script to convert data for a model Add a script for preprocessing data for the second qodana model (inspections line by line) --- src/python/common/tool_arguments.py | 7 + src/python/evaluation/README.md | 6 +- src/python/evaluation/inspectors/README.md | 4 +- .../inspectors/print_inspectors_statistics.py | 4 +- src/python/evaluation/qodana/README.md | 175 ++++++++++++++++++ .../evaluation/qodana/filter_inspections.py | 8 +- .../qodana/fragment_to_inspections_list.py | 33 ++++ ...agment_to_inspections_list_line_by_line.py | 62 +++++++ .../qodana/get_unique_inspectors.py | 94 ++++++++++ src/python/evaluation/qodana/util/models.py | 9 + src/python/evaluation/qodana/util/util.py | 44 ++++- whitelist.txt | 2 + 12 files changed, 434 insertions(+), 14 deletions(-) create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list.py create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py create mode 100644 src/python/evaluation/qodana/get_unique_inspectors.py diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index d3048051..65af5a53 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -89,3 +89,10 @@ class RunToolArgument(Enum): DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path', 'Path to a file with serialized diffs that were founded by diffs_between_df.py') + + QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 
'solutions_file_path', + 'Csv file with solutions. This file must be graded by Qodana.') + + QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.') + + QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections') diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index 5aa4bdf7..af2dbbd8 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -29,7 +29,7 @@ Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| -|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| +|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| |**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | -|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| +|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. 
Default is `results.xlsx` or `results.csv`.| diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md index a0de1314..5c54fe93 100644 --- a/src/python/evaluation/inspectors/README.md +++ b/src/python/evaluation/inspectors/README.md @@ -161,8 +161,8 @@ Optional arguments: Argument | Description --- | --- |**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.| -|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.| -|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| +|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.| +|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| The statistics will be printed into console. diff --git a/src/python/evaluation/inspectors/print_inspectors_statistics.py b/src/python/evaluation/inspectors/print_inspectors_statistics.py index 8b132a31..0a5605dd 100644 --- a/src/python/evaluation/inspectors/print_inspectors_statistics.py +++ b/src/python/evaluation/inspectors/print_inspectors_statistics.py @@ -19,12 +19,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: help='If True, statistics will be categorized by several categories.', action='store_true') - parser.add_argument('-n', '--top_n', + parser.add_argument('-n', '--top-n', help='The top N items will be printed', type=int, default=10) - parser.add_argument('--full_stat', + parser.add_argument('--full-stat', help='If True, full statistics will be printed.', action='store_true') diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index a18748a4..4e78972c 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -33,6 +33,7 @@ This module allows preparing datasets that were graded by [dataset_marking.py](d Data processing consists of several 
stages: - union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script and filter inspections list if it is necessary; +- get all unique inspections from the dataset; - convert `csv` file into a special format. ## Filter inspections @@ -59,3 +60,177 @@ Argument | Description |**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. | The resulting file will be stored in the `dataset_folder`. + +___ + +## Get all unique inspections + +This stage allow you to get all unique inspections from a `csv` file graded by Qodana. +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`. +`id` is unique number for each inspection, minimal value is 1. +`inspection_id` is unique Qoadana id for each inspection. +`count_all` count all fragments where was this inspection (with duplicates). +`count_uniq` count all fragments where was this inspection (without duplicates). + +#### Usage + +Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line. + +Required arguments: + +`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑uniq**| To count all fragments for each inspection where was this inspection (without duplicates). By default it disabled. | + +The resulting file will be stored in the same folder as the input file. 
+ +An example of the output file: + +```json +id | inspection_id | count_all | count_unique +-----|---------------------|--------------|-------------- +1 | SystemOutErr | 5 | 2 +2 | ConstantExpression | 1 | 1 +``` + +___ + +#### Convert `csv` file into a special format + +This block describes what format can be converted csv-file with code samples +graded by [dataset_marking.py](dataset_marking.py) script. + +We have two different formats: +- fragment to inspections list; +- fragment to inspections list with positions. + + +#### Fragment to inspections list + +This data representation match code fragments to a list with ids of inspections. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. + +#### Usage + +Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + +The resulting file will be stored in the same folder as the input file. + +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... 
\""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | 0 +3 | "// some code" | java11 | 1 +0 | "// some code" | java11 | 2,2 +1 | "// some code" | java11 | 0 + +``` + +--- + +#### Fragment to inspections list with positions + +This data representation match each line in code fragments to a list with ids of inspections in this line. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. +Note, that each line in code fragments in the new file is stored in a separate row. +All indents as well as blank lines are keeped. + +#### Usage + +Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + +The resulting file will be stored in the same folder as the input file. 
+ +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|----------------------------------------|---------------|----------------- +2 | "// first line from code with id 2" | java11 | 0 +2 | "// second line from code with id 2" | java11 | 0 +3 | "// first line from code with id 3" | java11 | 1 +3 | "// second line from code with id 3" | java11 | 0 +0 | "// first line from code with id 0" | java11 | 0 +0 | "// second line from code with id 0" | java11 | 2,2 +1 | "// first line from code with id 1" | java11 | 0 +1 | "// second line from code with id 1" | java11 | 0 + +``` diff --git a/src/python/evaluation/qodana/filter_inspections.py b/src/python/evaluation/qodana/filter_inspections.py index 6f758965..9321a7eb 100644 --- a/src/python/evaluation/qodana/filter_inspections.py +++ b/src/python/evaluation/qodana/filter_inspections.py @@ -1,5 +1,4 @@ import argparse -import json from pathlib import Path from typing import List @@ -7,7 +6,7 @@ from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path from src.python.evaluation.common.util import parse_set_arg -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, 
QodanaIssue from src.python.evaluation.qodana.util.util import to_json from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items @@ -35,9 +34,8 @@ def __get_qodana_dataset(root: Path) -> pd.DataFrame: def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str: - issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] - filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, - map(lambda i: QodanaIssue.from_json(i), issues_list))) + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, issues_list)) return to_json(filtered_issues) diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list.py b/src/python/evaluation/qodana/fragment_to_inspections_list.py new file mode 100644 index 00000000..42fe3ec6 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list.py @@ -0,0 +1,33 @@ +import argparse +from pathlib import Path + +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) +from src.python.review.common.file_system import Extension, get_parent_folder + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_model_converter_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = get_inspections_dict(args.inspections_path) + + solutions_df[INSPECTIONS] = solutions_df.apply( + lambda row: 
replace_inspections_on_its_ids(QodanaIssue.parse_list_issues_from_json(row[INSPECTIONS]), + inspections_dict, args.remove_duplicates), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids{Extension.CSV.value}', solutions_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py new file mode 100644 index 00000000..c70d9ba1 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py @@ -0,0 +1,62 @@ +import argparse +import os +from itertools import groupby +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) +from src.python.review.common.file_system import Extension, get_parent_folder + + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +CODE = ColumnName.CODE.value + + +# Make a new dataframe where code fragment is separated line by line and inspections are grouped line by line +def __replace_inspections_to_its_ids_in_row(row: pd.Series, inspections_dict: Dict[str, int], + to_remove_duplicates: bool) -> pd.DataFrame: + row_df = pd.DataFrame(row).transpose() + fragment_lines = row_df.iloc[0][CODE].split(os.linesep) + fragment_df = row_df.loc[row_df.index.repeat(len(fragment_lines))].reset_index(drop=True) + + issues_list = QodanaIssue.parse_list_issues_from_json(row_df.iloc[0][INSPECTIONS]) + line_number_to_issues = {k: list(v) for k, v in 
groupby(issues_list, key=lambda i: i.line)} + for index, fragment_line in enumerate(fragment_lines): + issues = line_number_to_issues.get(index + 1, []) + fragment_df.iloc[index][CODE] = fragment_line + fragment_df.iloc[index][INSPECTIONS] = replace_inspections_on_its_ids(issues, inspections_dict, + to_remove_duplicates) + return fragment_df + + +def __append_df(df: pd.DataFrame, df_list: List[pd.DataFrame]) -> None: + df_list.append(df) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_model_converter_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = get_inspections_dict(args.inspections_path) + + fragment_df_list = [] + solutions_df.apply( + lambda row: __append_df(__replace_inspections_to_its_ids_in_row(row, inspections_dict, args.remove_duplicates), + fragment_df_list), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids_line_by_line{Extension.CSV.value}', pd.concat(fragment_df_list)) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py new file mode 100644 index 00000000..35c32bdb --- /dev/null +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -0,0 +1,94 @@ +import argparse +import itertools +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.review.common.file_system import Extension, get_parent_folder + + 
+INSPECTION_ID = QodanaColumnName.INSPECTION_ID.value +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +COUNT_ALL = QodanaColumnName.COUNT_ALL.value +COUNT_UNIQUE = QodanaColumnName.COUNT_UNIQUE.value +ID = QodanaColumnName.ID.value + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument('--uniq', + help='If True, count fragments for eash inspection in which this inspection was.', + action='store_true') + + +def __get_inspections_ids(json_issues: str) -> List[str]: + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + return list(map(lambda i: i.problem_id, issues_list)) + + +def __get_inspections_from_df(solutions_df: pd.DataFrame) -> List[str]: + inspections = solutions_df.apply(lambda row: __get_inspections_ids(row[INSPECTIONS]), axis=1) + return list(itertools.chain.from_iterable(inspections.values)) + + +def __count_uniq_inspections_in_fragment(json_issues: str, inspection_id_to_fragments: Dict[str, int]) -> None: + issues_list = set(__get_inspections_ids(json_issues)) + for issue in issues_list: + inspection_id_to_fragments[issue] += 1 + + +def __get_uniq_inspections_in_all_fragments(solutions_df: pd.DataFrame) -> Dict[str, int]: + inspection_id_to_fragments: Dict[str, int] = defaultdict(int) + solutions_df.apply(lambda row: __count_uniq_inspections_in_fragment(row[INSPECTIONS], inspection_id_to_fragments), + axis=1) + + return inspection_id_to_fragments + + +def __get_all_inspections_by_inspection_id(inspection_id: str, all_inspections: List[str]) -> List[str]: + return list(filter(lambda i: i == inspection_id, all_inspections)) + + +def __create_unique_inspections_df(inspections: List[str], + inspection_id_to_fragments: Optional[Dict[str, int]]) -> pd.DataFrame: + id_to_inspection = {} + for index, inspection 
in enumerate(set(inspections)): + id_to_inspection[index + 1] = inspection + inspections_df = pd.DataFrame(id_to_inspection.items(), columns=[ID, INSPECTION_ID]) + inspections_df[COUNT_ALL] = inspections_df.apply(lambda row: len(__get_all_inspections_by_inspection_id( + row[INSPECTION_ID], inspections)), axis=1) + if inspection_id_to_fragments is None: + inspections_df[COUNT_UNIQUE] = 0 + else: + inspections_df[COUNT_UNIQUE] = inspections_df.apply(lambda row: inspection_id_to_fragments.get( + row[INSPECTION_ID], 0), axis=1) + return inspections_df + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + if args.uniq: + inspection_id_to_fragments = __get_uniq_inspections_in_all_fragments(solutions_df) + else: + inspection_id_to_fragments = None + inspections_df = __create_unique_inspections_df(__get_inspections_from_df(solutions_df), inspection_id_to_fragments) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/models.py b/src/python/evaluation/qodana/util/models.py index f5b3a589..08ce4c9f 100644 --- a/src/python/evaluation/qodana/util/models.py +++ b/src/python/evaluation/qodana/util/models.py @@ -1,6 +1,7 @@ import json from dataclasses import dataclass from enum import Enum, unique +from typing import List @dataclass(frozen=True) @@ -38,10 +39,18 @@ def from_json(cls, str_json: str) -> 'QodanaIssue': problem_id=issue[QodanaJsonField.PROBLEM_ID.value], ) + @classmethod + def parse_list_issues_from_json(cls, str_json: str) -> List['QodanaIssue']: + return list(map(lambda i: QodanaIssue.from_json(i), json.loads(str_json)[QodanaJsonField.ISSUES.value])) + @unique class QodanaColumnName(Enum): 
     INSPECTIONS = 'inspections'
+    ID = 'id'
+    INSPECTION_ID = 'inspection_id'
+    COUNT_ALL = 'count_all'
+    COUNT_UNIQUE = 'count_unique'
 
 
 @unique
diff --git a/src/python/evaluation/qodana/util/util.py b/src/python/evaluation/qodana/util/util.py
index 0c4b8712..3766b09d 100644
--- a/src/python/evaluation/qodana/util/util.py
+++ b/src/python/evaluation/qodana/util/util.py
@@ -1,7 +1,11 @@
+import argparse
 import json
-from typing import List
+from pathlib import Path
+from typing import Dict, List
 
-from src.python.evaluation.qodana.util.models import QodanaIssue, QodanaJsonField
+import pandas as pd
+from src.python.common.tool_arguments import RunToolArgument
+from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
 
 
 def to_json(issues: List[QodanaIssue]) -> str:
@@ -9,3 +13,39 @@ def to_json(issues: List[QodanaIssue]) -> str:
         QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)),
     }
     return json.dumps(issues_json)
+
+
+# Get a dictionary: Qodana inspection_id -> numeric id, from a csv file with two columns: id, inspection_id
+def get_inspections_dict(inspections_path: str) -> Dict[str, int]:
+    inspections_df = pd.read_csv(inspections_path)
+    inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list')
+    for qodana_id, id_list in inspections_dict.items():
+        inspections_dict[qodana_id] = id_list[0]
+    return inspections_dict
+
+
+def replace_inspections_on_its_ids(issues_list: List[QodanaIssue], inspections_dict: Dict[str, int],
+                                   to_remove_duplicates: bool) -> str:
+    if len(issues_list) == 0:
+        inspections = '0'
+    else:
+        problem_id_list = list(map(lambda i: inspections_dict[i.problem_id], issues_list))
+        if to_remove_duplicates:
+            problem_id_list = list(set(problem_id_list))
+        problem_id_list.sort()
+        inspections = ','.join(str(p) for p in problem_id_list)
+    return inspections
+
+
+def configure_model_converter_arguments(parser: argparse.ArgumentParser) -> None:
+    
parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument(RunToolArgument.QODANA_INSPECTIONS_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_INSPECTIONS_PATH.value.description) + + parser.add_argument(RunToolArgument.QODANA_DUPLICATES.value.long_name, + help=RunToolArgument.QODANA_DUPLICATES.value.description, + action='store_true') diff --git a/whitelist.txt b/whitelist.txt index e7c8e657..6269ca26 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -115,3 +115,5 @@ groupby getuid Popen datasets +usecols +linesep From e41367803e4c7f7a59b85a5202f1f4a952df7f13 Mon Sep 17 00:00:00 2001 From: Daria Diatlova Date: Mon, 24 May 2021 21:24:45 +0300 Subject: [PATCH 3/4] Update README.md --- src/python/evaluation/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index af2dbbd8..f8fafbe0 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -29,7 +29,7 @@ Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| -|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| +|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| |**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. 
| -|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| +|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| From f1524f80a6f57aeedc841fc83086fb25b4fdab66 Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Mon, 31 May 2021 17:41:13 +0300 Subject: [PATCH 4/4] Resolve conflicts --- src/python/evaluation/inspectors/filter_issues.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/python/evaluation/inspectors/filter_issues.py b/src/python/evaluation/inspectors/filter_issues.py index f5e4537d..e0d7d86b 100644 --- a/src/python/evaluation/inspectors/filter_issues.py +++ b/src/python/evaluation/inspectors/filter_issues.py @@ -5,9 +5,8 @@ import pandas as pd from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path -from src.python.evaluation.common.util import ColumnName, EvaluationArgument -from src.python.evaluation.inspectors.common.statistics import PenaltyIssue from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg +from src.python.evaluation.inspectors.common.statistics import PenaltyIssue from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file from src.python.review.inspectors.issue import BaseIssue @@ -28,10 +27,6 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: default='') -def __parse_issues_arg(str_issues: str) -> Set[str]: - return set(str_issues.split(',')) - - def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> 
List[PenaltyIssue]: all_issues = get_issues_from_json(traceback) return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues)) @@ -61,7 +56,7 @@ def main() -> None: solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) - issues = __parse_issues_arg(args.issues) + issues = parse_set_arg(args.issues) diffs = get_statistics_dict(solutions_df, issues) output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}'