Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/python/common/tool_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,10 @@ class RunToolArgument(Enum):

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were founded by diffs_between_df.py')

QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path',
'Csv file with solutions. This file must be graded by Qodana.')

QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.')

QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections')
6 changes: 3 additions & 3 deletions src/python/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Optional arguments:
Argument | Description
--- | ---
|**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.|
|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
6 changes: 6 additions & 0 deletions src/python/evaluation/common/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum, unique
from typing import Set

from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import Extension
Expand Down Expand Up @@ -43,3 +44,8 @@ class EvaluationArgument(Enum):
f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, '
f'{LanguageVersion.JAVA_8.value} ,'
f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.')


def parse_set_arg(str_arg: str, separator: str = ',') -> Set[str]:
    """Split *str_arg* on *separator* and return the unique parts as a set.

    Note: an empty input string yields ``{''}``, mirroring ``str.split``.
    """
    parts = str_arg.split(separator)
    return set(parts)
4 changes: 2 additions & 2 deletions src/python/evaluation/inspectors/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ Optional arguments:
Argument | Description
--- | ---
|**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.|
|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.|
|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|
|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.|
|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|

The statistics will be printed into console.

Expand Down
11 changes: 4 additions & 7 deletions src/python/evaluation/inspectors/filter_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import pandas as pd
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg
from src.python.evaluation.inspectors.common.statistics import PenaltyIssue
from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file
from src.python.review.inspectors.issue import BaseIssue


TRACEBACK = EvaluationArgument.TRACEBACK.value
Expand All @@ -26,16 +27,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
default='')


def __parse_issues_arg(str_issues: str) -> Set[str]:
    """Turn a comma-separated list of issue classes into a set of names."""
    return {issue for issue in str_issues.split(',')}


def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[PenaltyIssue]:
    """Deserialize the issues stored in *traceback* and keep only those
    whose origin class appears in *new_issues_classes*.
    """
    deserialized = get_issues_from_json(traceback)
    return [issue for issue in deserialized if issue.origin_class in new_issues_classes]


def __add_issues_for_fragment(fragment_id: int, new_issues: List[PenaltyIssue], diffs: dict) -> None:
def __add_issues_for_fragment(fragment_id: int, new_issues: List[BaseIssue], diffs: dict) -> None:
    """Record *new_issues* for *fragment_id* in the traceback section of
    *diffs*; fragments with no new issues are not recorded at all.
    """
    if new_issues:
        diffs[TRACEBACK][fragment_id] = new_issues

Expand All @@ -59,7 +56,7 @@ def main() -> None:

solutions_file_path = args.solutions_file_path
solutions_df = get_solutions_df_by_file_path(solutions_file_path)
issues = __parse_issues_arg(args.issues)
issues = parse_set_arg(args.issues)

diffs = get_statistics_dict(solutions_df, issues)
output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
help='If True, statistics will be categorized by several categories.',
action='store_true')

parser.add_argument('-n', '--top_n',
parser.add_argument('-n', '--top-n',
help='The top N items will be printed',
type=int,
default=10)

parser.add_argument('--full_stat',
parser.add_argument('--full-stat',
help='If True, full statistics will be printed.',
action='store_true')

Expand Down
212 changes: 212 additions & 0 deletions src/python/evaluation/qodana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,215 @@ Run the [dataset_labeling.py](dataset_labeling.py) with the arguments from comma
| **‑l**, **‑‑limit** | Allows you to read only the specified number of first rows from the dataset. If no limit is specified, the whole dataset will be processed. |
| **‑s**, **‑‑chunk‑size** | The number of files that Qodana will process at a time. Default is `5000`. |
| **‑o**, **‑‑output‑path** | The path where the labeled dataset will be saved. If not specified, the original dataset will be overwritten. |

---

# Postprocessing

The model that imitates Qodana analysis gets its input from a dataset in a special format.
This module allows preparing datasets that were graded by the [dataset_marking.py](dataset_marking.py) script.

Data processing consists of several stages:
- combine several `csv` files that were graded by the [dataset_marking.py](dataset_marking.py) script
and filter the inspections list if necessary;
- get all unique inspections from the dataset;
- convert `csv` file into a special format.

## Filter inspections

This stage allows you to combine several `csv` files that were graded by the [dataset_marking.py](dataset_marking.py) script
and to filter the inspections list if necessary.

Please note that all your input files must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with all the columns from the input files.

#### Usage

Run the [filter_inspections.py](filter_inspections.py) with the arguments from command line.

Required arguments:

`dataset_folder` — path to a folder with csv files graded by Qodana. Each file must have `inspections` column.

Optional arguments:
Argument | Description
--- | ---
|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. |

The resulting file will be stored in the `dataset_folder`.

___

## Get all unique inspections

This stage allows you to get all unique inspections from a `csv` file graded by Qodana.
Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`.
`id` is a unique number for each inspection; the minimal value is 1.
`inspection_id` is the unique Qodana id for each inspection.
`count_all` is the number of fragments in which this inspection occurred (with duplicates).
`count_uniq` is the number of fragments in which this inspection occurred (without duplicates).

#### Usage

Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line.

Required arguments:

`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑uniq**| Count, for each inspection, all fragments in which this inspection occurred (without duplicates). By default it is disabled. |

The resulting file will be stored in the same folder as the input file.

An example of the output file:

```json
id | inspection_id | count_all | count_unique
-----|---------------------|--------------|--------------
1 | SystemOutErr | 5 | 2
2 | ConstantExpression | 1 | 1
```

___

#### Convert `csv` file into a special format

This block describes the formats into which a csv-file with code samples
graded by the [dataset_marking.py](dataset_marking.py) script can be converted.

We have two different formats:
- fragment to inspections list;
- fragment to inspections list with positions.


#### Fragment to inspections list

This data representation matches code fragments to a list with the ids of inspections.

Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with a new `inspections` column that contains the list of inspection ids.
If the list of inspections for a fragment is empty, 0 is written instead.

#### Usage

Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | 0
3 | "// some code" | java11 | 1
0 | "// some code" | java11 | 2,2
1 | "// some code" | java11 | 0

```

---

#### Fragment to inspections list with positions

This data representation matches each line in the code fragments to a list with the ids of the inspections found in that line.

Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with a new `inspections` column that contains the list of inspection ids.
If the list of inspections for a fragment is empty, 0 is written instead.
Note that each line of the code fragments is stored in a separate row of the new file.
All indents as well as blank lines are kept.

#### Usage

Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|----------------------------------------|---------------|-----------------
2 | "// first line from code with id 2" | java11 | 0
2 | "// second line from code with id 2" | java11 | 0
3 | "// first line from code with id 3" | java11 | 1
3 | "// second line from code with id 3" | java11 | 0
0 | "// first line from code with id 0" | java11 | 0
0 | "// second line from code with id 0" | java11 | 2,2
1 | "// first line from code with id 1" | java11 | 0
1 | "// second line from code with id 1" | java11 | 0

```
12 changes: 3 additions & 9 deletions src/python/evaluation/qodana/dataset_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import to_json
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import (
copy_directory,
Expand Down Expand Up @@ -186,13 +187,6 @@ def _parse_inspections_files(cls, inspections_files: List[Path]) -> Dict[int, Li
id_to_issues[fragment_id].append(qodana_issue)
return id_to_issues

@classmethod
def _to_json(cls, issues: List[QodanaIssue]) -> str:
    """Serialize *issues* into the Qodana JSON payload string.

    The payload is an object with a single issues field holding the
    per-issue JSON representations.
    """
    payload = {
        QodanaJsonField.ISSUES.value: [issue.to_json() for issue in issues],
    }
    return json.dumps(payload)

def _label_chunk(self, chunk: pd.DataFrame, language: LanguageVersion, chunk_id: int) -> pd.DataFrame:
tmp_dir_path = self.dataset_path.parent.absolute() / f'qodana_project_{chunk_id}'
create_directory(tmp_dir_path)
Expand All @@ -219,7 +213,7 @@ def _label_chunk(self, chunk: pd.DataFrame, language: LanguageVersion, chunk_id:

logger.info('Write inspections')
chunk[QodanaColumnName.INSPECTIONS.value] = chunk.apply(
lambda row: self._to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1,
lambda row: to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1,
)

remove_directory(tmp_dir_path)
Expand Down
Loading