Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/python/common/tool_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,10 @@ class RunToolArgument(Enum):

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were founded by diffs_between_df.py')

# --- Qodana-related CLI arguments (enum members of RunToolArgument) ---
# NOTE(review): the continuation lines below lost their indentation in this
# paste of the diff — confirm alignment against the real source file.

# Positional argument: path to a CSV file with solutions already graded by Qodana.
QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path',
'Csv file with solutions. This file must be graded by Qodana.')

# Positional argument: path to a CSV file listing inspections (e.g. the output of get_unique_inspectors.py).
QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.')

# Optional flag: drop duplicate inspection ids within each row.
QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections')
6 changes: 3 additions & 3 deletions src/python/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Optional arguments:
Argument | Description
--- | ---
|**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.|
|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑tp**, **‑‑tool&#8209;path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
|**‑ofp**, **‑‑output&#8209;folder&#8209;path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output&#8209;file&#8209;name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
4 changes: 2 additions & 2 deletions src/python/evaluation/inspectors/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ Optional arguments:
Argument | Description
--- | ---
|**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.|
|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.|
|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|
|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.|
|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|

The statistics will be printed into console.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
help='If True, statistics will be categorized by several categories.',
action='store_true')

parser.add_argument('-n', '--top_n',
parser.add_argument('-n', '--top-n',
help='The top N items will be printed',
type=int,
default=10)

parser.add_argument('--full_stat',
parser.add_argument('--full-stat',
help='If True, full statistics will be printed.',
action='store_true')

Expand Down
175 changes: 175 additions & 0 deletions src/python/evaluation/qodana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ This module allows preparing datasets that were graded by [dataset_marking.py](d
Data processing consists of several stages:
- union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script
and filter inspections list if it is necessary;
- get all unique inspections from the dataset;
- convert `csv` file into a special format.

## Filter inspections
Expand All @@ -59,3 +60,177 @@ Argument | Description
|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. |

The resulting file will be stored in the `dataset_folder`.

___

## Get all unique inspections

This stage allows you to get all unique inspections from a `csv` file graded by Qodana.
Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`.
`id` is a unique number for each inspection; the minimal value is 1.
`inspection_id` is the unique Qodana id for each inspection.
`count_all` counts all fragments where this inspection occurred (including duplicates).
`count_uniq` counts all fragments where this inspection occurred (excluding duplicates).

#### Usage

Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line.

Required arguments:

`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑uniq**| To count, for each inspection, all fragments where this inspection occurred (without duplicates). By default it is disabled. |

The resulting file will be stored in the same folder as the input file.

An example of the output file:

```json
id | inspection_id | count_all | count_unique
-----|---------------------|--------------|--------------
1 | SystemOutErr | 5 | 2
2 | ConstantExpression | 1 | 1
```

___

#### Convert `csv` file into a special format

This block describes the formats into which a csv-file with code samples
graded by the [dataset_marking.py](dataset_marking.py) script can be converted.

We have two different formats:
- fragment to inspections list;
- fragment to inspections list with positions.


#### Fragment to inspections list

This data representation matches code fragments to a list with ids of inspections.

Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and has `inspections` column.

Output file is a new `csv` file with a new `inspections` column with list with ids of inspections.
If the list of inspections for the fragment is empty, then write 0.

#### Usage

Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | 0
3 | "// some code" | java11 | 1
0 | "// some code" | java11 | 2,2
1 | "// some code" | java11 | 0

```

---

#### Fragment to inspections list with positions

This data representation matches each line in code fragments to a list with ids of inspections in this line.

Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and has `inspections` column.

Output file is a new `csv` file with a new `inspections` column with list with ids of inspections.
If the list of inspections for the fragment is empty, then write 0.
Note, that each line in code fragments in the new file is stored in a separate row.
All indents as well as blank lines are kept.

#### Usage

Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|----------------------------------------|---------------|-----------------
2 | "// first line from code with id 2" | java11 | 0
2 | "// second line from code with id 2" | java11 | 0
3 | "// first line from code with id 3" | java11 | 1
3 | "// second line from code with id 3" | java11 | 0
0 | "// first line from code with id 0" | java11 | 0
0 | "// second line from code with id 0" | java11 | 2,2
1 | "// first line from code with id 1" | java11 | 0
1 | "// second line from code with id 1" | java11 | 0

```
8 changes: 3 additions & 5 deletions src/python/evaluation/qodana/filter_inspections.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import argparse
import json
from pathlib import Path
from typing import List

import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.common.util import parse_set_arg
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import to_json
from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items

Expand Down Expand Up @@ -35,9 +34,8 @@ def __get_qodana_dataset(root: Path) -> pd.DataFrame:


def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str:
    """Remove the listed inspections from a serialized Qodana issues field.

    :param json_issues: JSON string with the issues of one fragment.
    :param inspections_to_keep: inspection ids to drop from the result.
    :return: JSON string with the remaining issues.

    NOTE(review): despite the parameter name, issues whose ``problem_id`` IS in
    ``inspections_to_keep`` are removed (the ``not in`` keeps the others); this
    matches the README's "inspections to exclude" semantics — consider renaming
    the parameter to ``inspections_to_exclude``.
    """
    # The diff paste contained both the pre- and post-change implementations;
    # only the merged (post-change) version is kept here.
    issues_list = QodanaIssue.parse_list_issues_from_json(json_issues)
    filtered_issues = [issue for issue in issues_list if issue.problem_id not in inspections_to_keep]
    return to_json(filtered_issues)


Expand Down
33 changes: 33 additions & 0 deletions src/python/evaluation/qodana/fragment_to_inspections_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
from pathlib import Path

from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import (
configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids,
)
from src.python.review.common.file_system import Extension, get_parent_folder

INSPECTIONS = QodanaColumnName.INSPECTIONS.value


def main() -> None:
    """Replace serialized Qodana inspections with their numeric ids in a solutions CSV.

    Reads the solutions file and the inspections-to-id mapping given on the
    command line, rewrites the INSPECTIONS column row by row, and writes the
    result next to the input file as ``numbered_ids.csv``.
    """
    arg_parser = argparse.ArgumentParser()
    configure_model_converter_arguments(arg_parser)
    args = arg_parser.parse_args()

    solutions_path = args.solutions_file_path
    inspections_dict = get_inspections_dict(args.inspections_path)
    solutions_df = get_solutions_df_by_file_path(solutions_path)

    def convert_row(row):
        # Parse the serialized issues of one fragment and map them to ids.
        issues = QodanaIssue.parse_list_issues_from_json(row[INSPECTIONS])
        return replace_inspections_on_its_ids(issues, inspections_dict, args.remove_duplicates)

    solutions_df[INSPECTIONS] = solutions_df.apply(convert_row, axis=1)

    output_folder = get_parent_folder(Path(solutions_path))
    write_dataframe_to_csv(output_folder / f'numbered_ids{Extension.CSV.value}', solutions_df)


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import os
from itertools import groupby
from pathlib import Path
from typing import Dict, List

import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import (
configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids,
)
from src.python.review.common.file_system import Extension, get_parent_folder


# Dataframe column names used throughout this script.
INSPECTIONS = QodanaColumnName.INSPECTIONS.value  # serialized Qodana inspections
CODE = ColumnName.CODE.value  # code fragment text


# Make a new dataframe where the code fragment is separated line by line and
# inspections are grouped by the line they refer to.
def __replace_inspections_to_its_ids_in_row(row: pd.Series, inspections_dict: Dict[str, int],
                                            to_remove_duplicates: bool) -> pd.DataFrame:
    """Explode one solutions row into a dataframe with one row per code line.

    Every output row keeps all original columns, but CODE holds a single line
    of the fragment and INSPECTIONS holds the ids of the issues reported on
    that line (issue line numbers are 1-based).
    """
    row_df = pd.DataFrame(row).transpose()
    # NOTE(review): splitting on os.linesep is platform-dependent — confirm the
    # dataset always uses the current platform's line separator.
    fragment_lines = row_df.iloc[0][CODE].split(os.linesep)
    fragment_df = row_df.loc[row_df.index.repeat(len(fragment_lines))].reset_index(drop=True)

    issues_list = QodanaIssue.parse_list_issues_from_json(row_df.iloc[0][INSPECTIONS])
    # groupby only groups *consecutive* elements, so sort by line first;
    # otherwise non-adjacent issues on the same line form separate groups and
    # the dict comprehension silently keeps only the last one.
    issues_list = sorted(issues_list, key=lambda issue: issue.line)
    line_number_to_issues = {k: list(v) for k, v in groupby(issues_list, key=lambda issue: issue.line)}

    code_column = fragment_df.columns.get_loc(CODE)
    inspections_column = fragment_df.columns.get_loc(INSPECTIONS)
    for index, fragment_line in enumerate(fragment_lines):
        issues = line_number_to_issues.get(index + 1, [])
        # Use iat instead of chained iloc[...][...] assignment: the chained form
        # can write into a temporary copy on mixed-dtype frames and lose the value.
        fragment_df.iat[index, code_column] = fragment_line
        fragment_df.iat[index, inspections_column] = replace_inspections_on_its_ids(issues, inspections_dict,
                                                                                   to_remove_duplicates)
    return fragment_df


def __append_df(df: pd.DataFrame, df_list: List[pd.DataFrame]) -> None:
    """Collect *df* into *df_list* (side-effect helper for DataFrame.apply)."""
    df_list += [df]


def main() -> None:
    """Split graded solutions line by line and map inspections to their ids.

    Each code fragment is exploded into one row per line, with the ids of the
    inspections reported on that line; the result is written next to the input
    file as ``numbered_ids_line_by_line.csv``.
    """
    arg_parser = argparse.ArgumentParser()
    configure_model_converter_arguments(arg_parser)
    args = arg_parser.parse_args()

    solutions_path = args.solutions_file_path
    solutions_df = get_solutions_df_by_file_path(solutions_path)
    inspections_dict = get_inspections_dict(args.inspections_path)

    collected_dfs = []

    def explode_row(row):
        # Explode one solutions row into per-line rows and collect the result.
        exploded = __replace_inspections_to_its_ids_in_row(row, inspections_dict, args.remove_duplicates)
        __append_df(exploded, collected_dfs)

    solutions_df.apply(explode_row, axis=1)

    output_folder = get_parent_folder(Path(solutions_path))
    result_df = pd.concat(collected_dfs)
    write_dataframe_to_csv(output_folder / f'numbered_ids_line_by_line{Extension.CSV.value}', result_df)


if __name__ == '__main__':
    main()
Loading