Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/python/common/tool_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,10 @@ class RunToolArgument(Enum):

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were founded by diffs_between_df.py')

# --- Qodana-related CLI arguments (enum members of RunToolArgument) ---
# NOTE(review): the continuation lines below lost their indentation in this
# paste of the diff — confirm alignment against the real source file.

# Positional argument: path to a CSV file with solutions already graded by Qodana.
QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path',
'Csv file with solutions. This file must be graded by Qodana.')

# Positional argument: path to a CSV file listing inspections (e.g. the output of get_unique_inspectors.py).
QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.')

# Optional flag: drop duplicate inspection ids within each row.
QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections')
6 changes: 3 additions & 3 deletions src/python/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Optional arguments:
Argument | Description
--- | ---
|**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.|
|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑tp**, **‑‑tool&#8209;path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
|**‑ofp**, **‑‑output&#8209;folder&#8209;path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output&#8209;file&#8209;name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
4 changes: 2 additions & 2 deletions src/python/evaluation/inspectors/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ Optional arguments:
Argument | Description
--- | ---
|**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.|
|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.|
|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|
|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.|
|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|

The statistics will be printed into console.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
help='If True, statistics will be categorized by several categories.',
action='store_true')

parser.add_argument('-n', '--top_n',
parser.add_argument('-n', '--top-n',
help='The top N items will be printed',
type=int,
default=10)

parser.add_argument('--full_stat',
parser.add_argument('--full-stat',
help='If True, full statistics will be printed.',
action='store_true')

Expand Down
175 changes: 175 additions & 0 deletions src/python/evaluation/qodana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ This module allows preparing datasets that were graded by [dataset_marking.py](d
Data processing consists of several stages:
- union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script
and filter inspections list if it is necessary;
- get all unique inspections from the dataset;
- convert `csv` file into a special format.

## Filter inspections
Expand All @@ -59,3 +60,177 @@ Argument | Description
|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. |

The resulting file will be stored in the `dataset_folder`.

___

## Get all unique inspections

This stage allows you to get all unique inspections from a `csv` file graded by Qodana.
Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`.
`id` is a unique number for each inspection; the minimal value is 1.
`inspection_id` is the unique Qodana id for each inspection.
`count_all` counts all fragments where this inspection occurred (including duplicates).
`count_uniq` counts all fragments where this inspection occurred (excluding duplicates).

#### Usage

Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line.

Required arguments:

`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑uniq**| To count, for each inspection, all fragments where this inspection occurred (without duplicates). By default it is disabled. |

The resulting file will be stored in the same folder as the input file.

An example of the output file:

```json
id | inspection_id | count_all | count_unique
-----|---------------------|--------------|--------------
1 | SystemOutErr | 5 | 2
2 | ConstantExpression | 1 | 1
```

___

#### Convert `csv` file into a special format

This block describes the formats into which a csv-file with code samples
graded by the [dataset_marking.py](dataset_marking.py) script can be converted.

We have two different formats:
- fragment to inspections list;
- fragment to inspections list with positions.


#### Fragment to inspections list

This data representation matches code fragments to a list with ids of inspections.

Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and has `inspections` column.

Output file is a new `csv` file with a new `inspections` column with list with ids of inspections.
If the list of inspections for the fragment is empty, then write 0.

#### Usage

Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | 0
3 | "// some code" | java11 | 1
0 | "// some code" | java11 | 2,2
1 | "// some code" | java11 | 0

```

---

#### Fragment to inspections list with positions

This data representation matches each line in code fragments to a list with ids of inspections in this line.

Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script
and has `inspections` column.

Output file is a new `csv` file with a new `inspections` column with list with ids of inspections.
If the list of inspections for the fragment is empty, then write 0.
Note, that each line in code fragments in the new file is stored in a separate row.
All indents as well as blank lines are kept.

#### Usage

Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|----------------------------------------|---------------|-----------------
2 | "// first line from code with id 2" | java11 | 0
2 | "// second line from code with id 2" | java11 | 0
3 | "// first line from code with id 3" | java11 | 1
3 | "// second line from code with id 3" | java11 | 0
0 | "// first line from code with id 0" | java11 | 0
0 | "// second line from code with id 0" | java11 | 2,2
1 | "// first line from code with id 1" | java11 | 0
1 | "// second line from code with id 1" | java11 | 0

```
8 changes: 3 additions & 5 deletions src/python/evaluation/qodana/filter_inspections.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import argparse
import json
from pathlib import Path
from typing import List

import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.common.util import parse_set_arg
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import to_json
from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items

Expand Down Expand Up @@ -35,9 +34,8 @@ def __get_qodana_dataset(root: Path) -> pd.DataFrame:


def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str:
    """Remove the listed inspections from a serialized Qodana issues field.

    :param json_issues: JSON string with the issues of one fragment.
    :param inspections_to_keep: inspection ids to drop from the result.
    :return: JSON string with the remaining issues.

    NOTE(review): despite the parameter name, issues whose ``problem_id`` IS in
    ``inspections_to_keep`` are removed (the ``not in`` keeps the others); this
    matches the README's "inspections to exclude" semantics — consider renaming
    the parameter to ``inspections_to_exclude``.
    """
    # The diff paste contained both the pre- and post-change implementations;
    # only the merged (post-change) version is kept here.
    issues_list = QodanaIssue.parse_list_issues_from_json(json_issues)
    filtered_issues = [issue for issue in issues_list if issue.problem_id not in inspections_to_keep]
    return to_json(filtered_issues)


Expand Down
33 changes: 33 additions & 0 deletions src/python/evaluation/qodana/fragment_to_inspections_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
from pathlib import Path

from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import (
configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids,
)
from src.python.review.common.file_system import Extension, get_parent_folder

INSPECTIONS = QodanaColumnName.INSPECTIONS.value


def main() -> None:
    """Replace serialized Qodana inspections with their numeric ids in a solutions CSV.

    Reads the solutions file and the inspections-to-id mapping given on the
    command line, rewrites the INSPECTIONS column row by row, and writes the
    result next to the input file as ``numbered_ids.csv``.
    """
    arg_parser = argparse.ArgumentParser()
    configure_model_converter_arguments(arg_parser)
    args = arg_parser.parse_args()

    solutions_path = args.solutions_file_path
    inspections_dict = get_inspections_dict(args.inspections_path)
    solutions_df = get_solutions_df_by_file_path(solutions_path)

    def convert_row(row):
        # Parse the serialized issues of one fragment and map them to ids.
        issues = QodanaIssue.parse_list_issues_from_json(row[INSPECTIONS])
        return replace_inspections_on_its_ids(issues, inspections_dict, args.remove_duplicates)

    solutions_df[INSPECTIONS] = solutions_df.apply(convert_row, axis=1)

    output_folder = get_parent_folder(Path(solutions_path))
    write_dataframe_to_csv(output_folder / f'numbered_ids{Extension.CSV.value}', solutions_df)


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import os
from itertools import groupby
from pathlib import Path
from typing import Dict, List

import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import (
configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids,
)
from src.python.review.common.file_system import Extension, get_parent_folder


# Dataframe column names used throughout this script.
INSPECTIONS = QodanaColumnName.INSPECTIONS.value  # serialized Qodana inspections
CODE = ColumnName.CODE.value  # code fragment text


# Make a new dataframe where the code fragment is separated line by line and
# inspections are grouped by the line they refer to.
def __replace_inspections_to_its_ids_in_row(row: pd.Series, inspections_dict: Dict[str, int],
                                            to_remove_duplicates: bool) -> pd.DataFrame:
    """Explode one solutions row into a dataframe with one row per code line.

    Every output row keeps all original columns, but CODE holds a single line
    of the fragment and INSPECTIONS holds the ids of the issues reported on
    that line (issue line numbers are 1-based).
    """
    row_df = pd.DataFrame(row).transpose()
    # NOTE(review): splitting on os.linesep is platform-dependent — confirm the
    # dataset always uses the current platform's line separator.
    fragment_lines = row_df.iloc[0][CODE].split(os.linesep)
    fragment_df = row_df.loc[row_df.index.repeat(len(fragment_lines))].reset_index(drop=True)

    issues_list = QodanaIssue.parse_list_issues_from_json(row_df.iloc[0][INSPECTIONS])
    # groupby only groups *consecutive* elements, so sort by line first;
    # otherwise non-adjacent issues on the same line form separate groups and
    # the dict comprehension silently keeps only the last one.
    issues_list = sorted(issues_list, key=lambda issue: issue.line)
    line_number_to_issues = {k: list(v) for k, v in groupby(issues_list, key=lambda issue: issue.line)}

    code_column = fragment_df.columns.get_loc(CODE)
    inspections_column = fragment_df.columns.get_loc(INSPECTIONS)
    for index, fragment_line in enumerate(fragment_lines):
        issues = line_number_to_issues.get(index + 1, [])
        # Use iat instead of chained iloc[...][...] assignment: the chained form
        # can write into a temporary copy on mixed-dtype frames and lose the value.
        fragment_df.iat[index, code_column] = fragment_line
        fragment_df.iat[index, inspections_column] = replace_inspections_on_its_ids(issues, inspections_dict,
                                                                                   to_remove_duplicates)
    return fragment_df


def __append_df(df: pd.DataFrame, df_list: List[pd.DataFrame]) -> None:
    """Collect *df* into *df_list* (side-effect helper for DataFrame.apply)."""
    df_list += [df]


def main() -> None:
    """Split graded solutions line by line and map inspections to their ids.

    Each code fragment is exploded into one row per line, with the ids of the
    inspections reported on that line; the result is written next to the input
    file as ``numbered_ids_line_by_line.csv``.
    """
    arg_parser = argparse.ArgumentParser()
    configure_model_converter_arguments(arg_parser)
    args = arg_parser.parse_args()

    solutions_path = args.solutions_file_path
    solutions_df = get_solutions_df_by_file_path(solutions_path)
    inspections_dict = get_inspections_dict(args.inspections_path)

    collected_dfs = []

    def explode_row(row):
        # Explode one solutions row into per-line rows and collect the result.
        exploded = __replace_inspections_to_its_ids_in_row(row, inspections_dict, args.remove_duplicates)
        __append_df(exploded, collected_dfs)

    solutions_df.apply(explode_row, axis=1)

    output_folder = get_parent_folder(Path(solutions_path))
    result_df = pd.concat(collected_dfs)
    write_dataframe_to_csv(output_folder / f'numbered_ids_line_by_line{Extension.CSV.value}', result_df)


if __name__ == '__main__':
    main()
Loading