Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/python/common/tool_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,10 @@ class RunToolArgument(Enum):

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were founded by diffs_between_df.py')

QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path',
'Csv file with solutions. This file must be graded by Qodana.')

QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.')

QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections')
6 changes: 3 additions & 3 deletions src/python/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Optional arguments:
Argument | Description
--- | ---
|**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.|
|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
6 changes: 6 additions & 0 deletions src/python/evaluation/common/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum, unique
from typing import Set

from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import Extension
Expand Down Expand Up @@ -43,3 +44,8 @@ class EvaluationArgument(Enum):
f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, '
f'{LanguageVersion.JAVA_8.value} ,'
f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.')


def parse_set_arg(str_arg: str, separator: str = ',') -> Set[str]:
    """Split *str_arg* on *separator* and return the unique parts as a set.

    Note: an empty input string yields ``{''}``, mirroring ``str.split``.
    """
    parts = str_arg.split(separator)
    return set(parts)
4 changes: 2 additions & 2 deletions src/python/evaluation/inspectors/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ Optional arguments:
Argument | Description
--- | ---
|**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.|
|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.|
|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|
|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.|
|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.|

The statistics will be printed into console.

Expand Down
11 changes: 4 additions & 7 deletions src/python/evaluation/inspectors/filter_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import pandas as pd
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg
from src.python.evaluation.inspectors.common.statistics import PenaltyIssue
from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file
from src.python.review.inspectors.issue import BaseIssue


TRACEBACK = EvaluationArgument.TRACEBACK.value
Expand All @@ -26,16 +27,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
default='')


def __parse_issues_arg(str_issues: str) -> Set[str]:
    """Turn a comma-separated list of issue classes into a set of names."""
    return {issue for issue in str_issues.split(',')}


def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[PenaltyIssue]:
    """Deserialize the issues stored in *traceback* and keep only those
    whose origin class appears in *new_issues_classes*.
    """
    deserialized = get_issues_from_json(traceback)
    return [issue for issue in deserialized if issue.origin_class in new_issues_classes]


def __add_issues_for_fragment(fragment_id: int, new_issues: List[PenaltyIssue], diffs: dict) -> None:
def __add_issues_for_fragment(fragment_id: int, new_issues: List[BaseIssue], diffs: dict) -> None:
    """Record *new_issues* for *fragment_id* in the traceback section of
    *diffs*; fragments with no new issues are not recorded at all.
    """
    if new_issues:
        diffs[TRACEBACK][fragment_id] = new_issues

Expand All @@ -59,7 +56,7 @@ def main() -> None:

solutions_file_path = args.solutions_file_path
solutions_df = get_solutions_df_by_file_path(solutions_file_path)
issues = __parse_issues_arg(args.issues)
issues = parse_set_arg(args.issues)

diffs = get_statistics_dict(solutions_df, issues)
output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
help='If True, statistics will be categorized by several categories.',
action='store_true')

parser.add_argument('-n', '--top_n',
parser.add_argument('-n', '--top-n',
help='The top N items will be printed',
type=int,
default=10)

parser.add_argument('--full_stat',
parser.add_argument('--full-stat',
help='If True, full statistics will be printed.',
action='store_true')

Expand Down
212 changes: 212 additions & 0 deletions src/python/evaluation/qodana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,215 @@ Run the [dataset_labeling.py](dataset_labeling.py) with the arguments from comma
| **‑l**, **‑‑limit** | Allows you to read only the specified number of first rows from the dataset. If no limit is specified, the whole dataset will be processed. |
| **‑s**, **‑‑chunk‑size** | The number of files that Qodana will process at a time. Default is `5000`. |
| **‑o**, **‑‑output‑path** | The path where the labeled dataset will be saved. If not specified, the original dataset will be overwritten. |

---

# Postprocessing

The model that imitates Qodana analysis gets its input from a dataset in a special format.
This module allows preparing datasets that were graded by the [dataset_marking.py](dataset_marking.py) script.

Data processing consists of several stages:
- combine several `csv` files that were graded by the [dataset_marking.py](dataset_marking.py) script
and filter the inspections list if necessary;
- get all unique inspections from the dataset;
- convert `csv` file into a special format.

## Filter inspections

This stage allows you to combine several `csv` files that were graded by the [dataset_marking.py](dataset_marking.py) script
and to filter the inspections list if necessary.

Please note that all your input files must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with all the columns from the input files.

#### Usage

Run the [filter_inspections.py](filter_inspections.py) with the arguments from command line.

Required arguments:

`dataset_folder` — path to a folder with csv files graded by Qodana. Each file must have `inspections` column.

Optional arguments:
Argument | Description
--- | ---
|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. |

The resulting file will be stored in the `dataset_folder`.

___

## Get all unique inspections

This stage allows you to get all unique inspections from a `csv` file graded by Qodana.
Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`.
`id` is a unique number for each inspection; the minimal value is 1.
`inspection_id` is the unique Qodana id for each inspection.
`count_all` is the number of fragments in which this inspection occurred (with duplicates).
`count_uniq` is the number of fragments in which this inspection occurred (without duplicates).

#### Usage

Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line.

Required arguments:

`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑uniq**| Count, for each inspection, all fragments in which this inspection occurred (without duplicates). By default it is disabled. |

The resulting file will be stored in the same folder as the input file.

An example of the output file:

```json
id | inspection_id | count_all | count_unique
-----|---------------------|--------------|--------------
1 | SystemOutErr | 5 | 2
2 | ConstantExpression | 1 | 1
```

___

#### Convert `csv` file into a special format

This block describes the formats into which a csv-file with code samples
graded by the [dataset_marking.py](dataset_marking.py) script can be converted.

We have two different formats:
- fragment to inspections list;
- fragment to inspections list with positions.


#### Fragment to inspections list

This data representation matches code fragments to a list with the ids of inspections.

Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with a new `inspections` column that contains the list of inspection ids.
If the list of inspections for a fragment is empty, 0 is written instead.

#### Usage

Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | 0
3 | "// some code" | java11 | 1
0 | "// some code" | java11 | 2,2
1 | "// some code" | java11 | 0

```

---

#### Fragment to inspections list with positions

This data representation matches each line in the code fragments to a list with the ids of the inspections found in that line.

Please note that your input file must be graded by the [dataset_marking.py](dataset_marking.py) script
and have an `inspections` column.

The output file is a new `csv` file with a new `inspections` column that contains the list of inspection ids.
If the list of inspections for a fragment is empty, 0 is written instead.
Note that each line of the code fragments is stored in a separate row of the new file.
All indents as well as blank lines are kept.

#### Usage

Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line.

Required arguments:

- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script,
- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script.

Optional arguments:
Argument | Description
--- | ---
|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. |

The resulting file will be stored in the same folder as the input file.

An example of the input file:

```json
id | code | lang | inspections
-----|-------------------|---------------|-----------------
2 | "// some code" | java11 | "{""issues"": []}"
3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}"
0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}"
1 | "// some code" | java11 | "{""issues"": []}"
```

with the inspections file:

```json
id | inspection_id
-----|-------------------
1 | SystemOutErr
2 | ConstantExpression
```

An example of the output file:

```json
id | code | lang | inspections
-----|----------------------------------------|---------------|-----------------
2 | "// first line from code with id 2" | java11 | 0
2 | "// second line from code with id 2" | java11 | 0
3 | "// first line from code with id 3" | java11 | 1
3 | "// second line from code with id 3" | java11 | 0
0 | "// first line from code with id 0" | java11 | 0
0 | "// second line from code with id 0" | java11 | 2,2
1 | "// first line from code with id 1" | java11 | 0
1 | "// second line from code with id 1" | java11 | 0

```
12 changes: 3 additions & 9 deletions src/python/evaluation/qodana/dataset_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue
from src.python.evaluation.qodana.util.util import to_json
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import (
copy_directory,
Expand Down Expand Up @@ -186,13 +187,6 @@ def _parse_inspections_files(cls, inspections_files: List[Path]) -> Dict[int, Li
id_to_issues[fragment_id].append(qodana_issue)
return id_to_issues

@classmethod
def _to_json(cls, issues: List[QodanaIssue]) -> str:
    """Serialize *issues* into the Qodana JSON payload string.

    The payload is an object with a single issues field holding the
    per-issue JSON representations.
    """
    payload = {
        QodanaJsonField.ISSUES.value: [issue.to_json() for issue in issues],
    }
    return json.dumps(payload)

def _label_chunk(self, chunk: pd.DataFrame, language: LanguageVersion, chunk_id: int) -> pd.DataFrame:
tmp_dir_path = self.dataset_path.parent.absolute() / f'qodana_project_{chunk_id}'
create_directory(tmp_dir_path)
Expand All @@ -219,7 +213,7 @@ def _label_chunk(self, chunk: pd.DataFrame, language: LanguageVersion, chunk_id:

logger.info('Write inspections')
chunk[QodanaColumnName.INSPECTIONS.value] = chunk.apply(
lambda row: self._to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1,
lambda row: to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1,
)

remove_directory(tmp_dir_path)
Expand Down
Loading