diff --git a/src/python/evaluation/issues_statistics/get_raw_issues.py b/src/python/evaluation/issues_statistics/get_raw_issues.py index b077f34f..9aaaac7d 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues.py @@ -9,13 +9,14 @@ sys.path.append('') sys.path.append('../../..') +import numpy as np import pandas as pd from pandarallel import pandarallel from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file from src.python.evaluation.common.util import ColumnName -from src.python.evaluation.evaluation_run_tool import get_language_version from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueEncoder +from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import ( create_file, Extension, @@ -38,6 +39,12 @@ ALLOWED_EXTENSION = {Extension.XLSX, Extension.CSV} +ERROR_CODES = [ + 'E999', # flake8 + 'WPS000', # flake8 (wps) + 'E0001', # pylint +] + logger = logging.getLogger(__name__) @@ -109,10 +116,13 @@ def _filter_issues( return filtered_issues +def _check_issues_for_errors(issues: List[BaseIssue]) -> bool: + origin_classes = {issue.origin_class for issue in issues} + return any(error_code in origin_classes for error_code in ERROR_CODES) + + def _inspect_row( - fragment_id: int, - code: str, - language_code: str, + row: pd.Series, solutions_file_path: Path, allow_duplicates: bool, allow_zero_measure_issues: bool, @@ -120,28 +130,38 @@ def _inspect_row( to_safe_path: bool, ) -> Optional[str]: + print(f'{row[ID]}: processing started') + + if pd.isnull(row[LANG]): + logger.warning(f'{row[ID]}: no lang.') + return np.nan + + if pd.isnull(row[CODE]): + logger.warning(f'{row[ID]}: no code.') + return np.nan + # If we were unable to identify the language version, we return None - try: - language_version = get_language_version(language_code) - except KeyError: - logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}"') - return None + language_version = LanguageVersion.from_value(row[LANG]) + if language_version is None: + logger.warning(f'{row[ID]}: it was not possible to determine the language version from "{row[LANG]}"') + return np.nan # If we were unable to identify the language, we return None language = Language.from_language_version(language_version) if language == Language.UNKNOWN: - logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}"') - return None + logger.warning(f'{row[ID]}: it was not possible to determine the language from "{language_version}"') + return np.nan # If there are no inspectors for the language, then return None inspectors = LANGUAGE_TO_INSPECTORS.get(language, []) if not inspectors: - logger.warning(f'{fragment_id}: no inspectors were found for the {language}.') - return None + logger.warning(f'{row[ID]}: no inspectors were found for the {language}.') + return np.nan tmp_file_extension = language_version.extension_by_language().value - tmp_file_path = solutions_file_path.parent.absolute() / f'fragment_{fragment_id}{tmp_file_extension}' - temp_file = next(create_file(tmp_file_path, code)) + tmp_file_path = solutions_file_path.parent.absolute() / f'fragment_{row[ID]}{tmp_file_extension}' + temp_file = next(create_file(tmp_file_path, row[CODE])) + inspectors_config = { 'language_version': language_version, 'n_cpu': 1, @@ -151,15 +171,26 @@ def _inspect_row( for inspector in inspectors: try: - raw_issues.extend(inspector.inspect(temp_file, inspectors_config)) + issues = inspector.inspect(temp_file, inspectors_config) + + if _check_issues_for_errors(issues): + logger.warning(f'{row[ID]}: inspector {inspector.inspector_type.value} failed.') + continue + + raw_issues.extend(issues) + except Exception: - logger.warning(f'{fragment_id}: inspector {inspector.inspector_type.value} failed.') + logger.warning(f'{row[ID]}: inspector {inspector.inspector_type.value} failed.') os.remove(temp_file) raw_issues = _filter_issues(raw_issues, allow_duplicates, allow_zero_measure_issues, allow_info_issues) - return json.dumps(raw_issues, cls=RawIssueEncoder, to_safe_path=to_safe_path) + json_issues = json.dumps(raw_issues, cls=RawIssueEncoder, to_safe_path=to_safe_path) + + print(f'{row[ID]}: processing finished.') + + return json_issues def _is_correct_output_path(output_path: Path) -> bool: @@ -195,16 +226,8 @@ def inspect_solutions( pandarallel.initialize() solutions_df[RAW_ISSUES] = solutions_df.parallel_apply( - lambda row: _inspect_row( - row[ID], - row[CODE], - row[LANG], - solutions_file_path, - allow_duplicates, - allow_zero_measure_issues, - allow_info_issues, - to_save_path, - ), + _inspect_row, + args=(solutions_file_path, allow_duplicates, allow_zero_measure_issues, allow_info_issues, to_save_path), axis=1, ) @@ -215,10 +238,18 @@ def main() -> None: parser = argparse.ArgumentParser() configure_arguments(parser) args = parser.parse_args() - logging.basicConfig(filename=args.log_output) + + if args.log_output is not None: + args.log_output.parent.mkdir(parents=True, exist_ok=True) + + logging.basicConfig( + filename=args.log_output, filemode='w', level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s', + ) solutions = get_solutions_df_by_file_path(args.solutions_file_path) + logger.info('Dataset inspection started.') + solutions_with_raw_issues = inspect_solutions( solutions, args.solutions_file_path, @@ -228,10 +259,17 @@ def main() -> None: args.to_save_path, ) + logger.info('Dataset inspection finished.') + output_path = _get_output_path(args.solutions_file_path, args.output) output_extension = Extension.get_extension_from_file(str(output_path)) + + logger.info(f'Saving the dataframe to a file: {output_path}.') + write_df_to_file(solutions_with_raw_issues, output_path, output_extension) + logger.info('Saving complete.') + if __name__ == '__main__': main() diff --git a/test/python/evaluation/issues_statistics/test_get_raw_issues.py b/test/python/evaluation/issues_statistics/test_get_raw_issues.py index cafa3b63..ba9a8610 100644 --- a/test/python/evaluation/issues_statistics/test_get_raw_issues.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues.py @@ -294,6 +294,7 @@ def test_filter_issues( ('test_fragment_per_language.csv', 'target_fragment_per_language.csv'), ('test_incorrect_language.csv', 'target_incorrect_language.csv'), ('test_incorrect_code.csv', 'target_incorrect_code.csv'), + ('test_rows_with_null.csv', 'target_rows_with_null.csv'), ] diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv index 305aa89e..898ec886 100644 --- a/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv +++ b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv @@ -1,15 +1,15 @@ id,lang,code,raw_issues -2760103,java8,"import java.util.*; +1,java11,"public class Main { + public static void main(String[] args) { + + int variable = 123456; // Change this line + + System.out.println(variable); + } -fun main(args: Array) { - val scanner = Scanner(System.`in`) - // put your code here - val sq = scanner.nextDouble() - val sqrt = Math.sqrt(Math.sqrt(sq)) - println(sqrt) ",[] -2760563,python3,"text = input() +2,python3,"text = input() words = text.split() for word in words: # finish the code here @@ -27,4 +27,4 @@ for word in words: print(word) else: continue -","[{""origin_class"": ""E0001"", ""type"": ""ERROR_PRONE"", ""description"": ""invalid syntax (, line 5)"", ""file_path"": """", ""line_no"": 5, ""column_no"": 32, ""inspector_type"": ""PYLINT""}, {""origin_class"": ""E999"", ""type"": ""CODE_STYLE"", ""description"": ""SyntaxError: invalid syntax"", ""file_path"": """", ""line_no"": 5, ""column_no"": 30, ""inspector_type"": ""FLAKE8""}, {""origin_class"": ""E113"", ""type"": ""CODE_STYLE"", ""description"": ""unexpected indentation"", ""file_path"": """", ""line_no"": 6, ""column_no"": 9, ""inspector_type"": ""FLAKE8""}]" +",[] diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_rows_with_null.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_rows_with_null.csv new file mode 100644 index 00000000..79c66788 --- /dev/null +++ b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_rows_with_null.csv @@ -0,0 +1,3 @@ +id,lang,code,raw_issues +1,,"print(""Hello, World!"")", +2,python3,, \ No newline at end of file diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv index da6058fb..12bb51c9 100644 --- a/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv +++ b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv @@ -1,15 +1,15 @@ id,lang,code -2760103,java8,"import java.util.*; +1,java11,"public class Main { + public static void main(String[] args) { + + int variable = 123456; // Change this line + + System.out.println(variable); + } -fun main(args: Array) { - val scanner = Scanner(System.`in`) - // put your code here - val sq = scanner.nextDouble() - val sqrt = Math.sqrt(Math.sqrt(sq)) - println(sqrt) " -2760563,python3,"text = input() +2,python3,"text = input() words = text.split() for word in words: # finish the code here diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_rows_with_null.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_rows_with_null.csv new file mode 100644 index 00000000..97cc5fb9 --- /dev/null +++ b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_rows_with_null.csv @@ -0,0 +1,3 @@ +id,lang,code +1,,"print(""Hello, World!"")" +2,python3, \ No newline at end of file