Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
2ef00d9
Fixed get_output_path test data
GirZ0n Jul 20, 2021
461b511
Moved __get_total_lines to file_system.py
GirZ0n Jul 26, 2021
bec0701
Added get_total_code_lines_from_file and get_total_code_lines_from_code
GirZ0n Jul 26, 2021
e5085cb
Added get_raw_issues_statistics
GirZ0n Jul 26, 2021
71caba1
Renamed main_stats -> freq_stats and other_stats -> ratio_stats
GirZ0n Jul 26, 2021
223c1d1
Small fix
GirZ0n Jul 26, 2021
207e49d
Added new data folders
GirZ0n Jul 26, 2021
df4bef2
Added tests
GirZ0n Jul 26, 2021
63104d8
Removed duplicates
GirZ0n Jul 26, 2021
3fb8b5e
Removed unnecessary line
GirZ0n Jul 26, 2021
2fcfede
Now the script returns only one dataframe
GirZ0n Jul 26, 2021
7152112
Fixed tests
GirZ0n Jul 26, 2021
49fec5e
Added new tests
GirZ0n Jul 26, 2021
531ee40
Added logger and small code refactoring
GirZ0n Jul 26, 2021
d7f98bc
Added some more logging
GirZ0n Jul 26, 2021
2bb65a8
Fixed test
GirZ0n Jul 26, 2021
c3bbf45
Fixed test
GirZ0n Jul 26, 2021
117961c
Fixed help message
GirZ0n Jul 26, 2021
32d8dc9
Update README.md
GirZ0n Jul 26, 2021
df5929d
Added more logging
GirZ0n Jul 27, 2021
39106af
statistics -> issues_statistics
GirZ0n Jul 27, 2021
28a9fe6
Fixed flake8
GirZ0n Jul 27, 2021
153ec3a
Added from_value function
GirZ0n Jul 30, 2021
bd14b02
Added comment
GirZ0n Jul 30, 2021
bf5b649
Added get_ratio
GirZ0n Jul 30, 2021
2bc1d17
Small refactoring: added get_ratio
GirZ0n Jul 30, 2021
000c1cb
Fixed PR issues
GirZ0n Jul 30, 2021
255c309
Merge branch 'develop' into statistics
GirZ0n Jul 30, 2021
06d687e
Small fixes
GirZ0n Jul 30, 2021
b1c988e
Merge remote-tracking branch 'origin/statistics' into statistics
GirZ0n Jul 30, 2021
de45c09
Added isnull
GirZ0n Jul 30, 2021
5c4e0d9
typo fix
GirZ0n Jul 30, 2021
c7f9efc
Fixed tests
GirZ0n Jul 30, 2021
38657fa
Added --log-output and fixed null checks
GirZ0n Jul 30, 2021
c39599d
Added filemode
GirZ0n Jul 30, 2021
31c34ed
typo fix
GirZ0n Jul 30, 2021
8780d0e
Fixed tests
GirZ0n Jul 30, 2021
37a9393
Fixed tests
GirZ0n Jul 30, 2021
b91dbbe
Merge branch 'develop' into script-fix
GirZ0n Jul 30, 2021
da232b9
Merge remote-tracking branch 'origin/statistics' into script-fix
GirZ0n Jul 30, 2021
2b3397d
Code refactoring and bug fixing:
GirZ0n Jul 30, 2021
0422db9
Fixed test
GirZ0n Jul 30, 2021
d96e1ad
Added new test
GirZ0n Aug 2, 2021
babd90b
Replaced None with np.nan
GirZ0n Aug 2, 2021
866e24a
Fixed test
GirZ0n Aug 2, 2021
5cbd389
Removed unnecessary space
GirZ0n Aug 2, 2021
e914166
Merge branch 'develop' into script-fix
GirZ0n Aug 2, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 67 additions & 29 deletions src/python/evaluation/issues_statistics/get_raw_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
sys.path.append('')
sys.path.append('../../..')

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.evaluation_run_tool import get_language_version
from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueEncoder
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import (
create_file,
Extension,
Expand All @@ -38,6 +39,12 @@

ALLOWED_EXTENSION = {Extension.XLSX, Extension.CSV}

# Issue codes (compared against `issue.origin_class`) that signal the inspector
# failed on the fragment instead of reporting an ordinary issue.
# If an inspector's result contains any of these codes, the inspector is logged
# as failed and all of its issues for that fragment are discarded.
ERROR_CODES = [
'E999', # flake8
'WPS000', # flake8 (wps)
'E0001', # pylint
]

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -109,39 +116,52 @@ def _filter_issues(
return filtered_issues


def _check_issues_for_errors(issues: List[BaseIssue]) -> bool:
    """
    Tell whether the given issues contain an inspector error marker.

    :param issues: issues produced by a single inspector run.
    :return: True if the origin class of at least one issue is listed in ERROR_CODES, False otherwise.
    """
    # Collect the distinct origin classes once, then test for any overlap with the error codes.
    reported_codes = {issue.origin_class for issue in issues}
    return not reported_codes.isdisjoint(ERROR_CODES)


def _inspect_row(
fragment_id: int,
code: str,
language_code: str,
row: pd.Series,
solutions_file_path: Path,
allow_duplicates: bool,
allow_zero_measure_issues: bool,
allow_info_issues: bool,
to_safe_path: bool,
) -> Optional[str]:

print(f'{row[ID]}: processing started')

if pd.isnull(row[LANG]):
logger.warning(f'{row[ID]}: no lang.')
return np.nan

if pd.isnull(row[CODE]):
logger.warning(f'{row[ID]}: no code.')
return np.nan

# If we were unable to identify the language version, we return np.nan
try:
language_version = get_language_version(language_code)
except KeyError:
logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}"')
return None
language_version = LanguageVersion.from_value(row[LANG])
if language_version is None:
logger.warning(f'{row[ID]}: it was not possible to determine the language version from "{row[LANG]}"')
return np.nan

# If we were unable to identify the language, we return np.nan
language = Language.from_language_version(language_version)
if language == Language.UNKNOWN:
logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}"')
return None
logger.warning(f'{row[ID]}: it was not possible to determine the language from "{language_version}"')
return np.nan

# If there are no inspectors for the language, then return np.nan
inspectors = LANGUAGE_TO_INSPECTORS.get(language, [])
if not inspectors:
logger.warning(f'{fragment_id}: no inspectors were found for the {language}.')
return None
logger.warning(f'{row[ID]}: no inspectors were found for the {language}.')
return np.nan

tmp_file_extension = language_version.extension_by_language().value
tmp_file_path = solutions_file_path.parent.absolute() / f'fragment_{fragment_id}{tmp_file_extension}'
temp_file = next(create_file(tmp_file_path, code))
tmp_file_path = solutions_file_path.parent.absolute() / f'fragment_{row[ID]}{tmp_file_extension}'
temp_file = next(create_file(tmp_file_path, row[CODE]))

inspectors_config = {
'language_version': language_version,
'n_cpu': 1,
Expand All @@ -151,15 +171,26 @@ def _inspect_row(

for inspector in inspectors:
try:
raw_issues.extend(inspector.inspect(temp_file, inspectors_config))
issues = inspector.inspect(temp_file, inspectors_config)

if _check_issues_for_errors(issues):
logger.warning(f'{row[ID]}: inspector {inspector.inspector_type.value} failed.')
continue

raw_issues.extend(issues)

except Exception:
logger.warning(f'{fragment_id}: inspector {inspector.inspector_type.value} failed.')
logger.warning(f'{row[ID]}: inspector {inspector.inspector_type.value} failed.')

os.remove(temp_file)

raw_issues = _filter_issues(raw_issues, allow_duplicates, allow_zero_measure_issues, allow_info_issues)

return json.dumps(raw_issues, cls=RawIssueEncoder, to_safe_path=to_safe_path)
json_issues = json.dumps(raw_issues, cls=RawIssueEncoder, to_safe_path=to_safe_path)

print(f'{row[ID]}: processing finished.')

return json_issues


def _is_correct_output_path(output_path: Path) -> bool:
Expand Down Expand Up @@ -195,16 +226,8 @@ def inspect_solutions(
pandarallel.initialize()

solutions_df[RAW_ISSUES] = solutions_df.parallel_apply(
lambda row: _inspect_row(
row[ID],
row[CODE],
row[LANG],
solutions_file_path,
allow_duplicates,
allow_zero_measure_issues,
allow_info_issues,
to_save_path,
),
_inspect_row,
args=(solutions_file_path, allow_duplicates, allow_zero_measure_issues, allow_info_issues, to_save_path),
axis=1,
)

Expand All @@ -215,10 +238,18 @@ def main() -> None:
parser = argparse.ArgumentParser()
configure_arguments(parser)
args = parser.parse_args()
logging.basicConfig(filename=args.log_output)

if args.log_output is not None:
args.log_output.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
filename=args.log_output, filemode='w', level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s',
)

solutions = get_solutions_df_by_file_path(args.solutions_file_path)

logger.info('Dataset inspection started.')

solutions_with_raw_issues = inspect_solutions(
solutions,
args.solutions_file_path,
Expand All @@ -228,10 +259,17 @@ def main() -> None:
args.to_save_path,
)

logger.info('Dataset inspection finished.')

output_path = _get_output_path(args.solutions_file_path, args.output)
output_extension = Extension.get_extension_from_file(str(output_path))

logger.info(f'Saving the dataframe to a file: {output_path}.')

write_df_to_file(solutions_with_raw_issues, output_path, output_extension)

logger.info('Saving complete.')


if __name__ == '__main__':
main()
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ def test_filter_issues(
('test_fragment_per_language.csv', 'target_fragment_per_language.csv'),
('test_incorrect_language.csv', 'target_incorrect_language.csv'),
('test_incorrect_code.csv', 'target_incorrect_code.csv'),
('test_rows_with_null.csv', 'target_rows_with_null.csv'),
]


Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
id,lang,code,raw_issues
2760103,java8,"import java.util.*;
1,java11,"public class Main {
public static void main(String[] args) {

int variable = 123456; // Change this line

System.out.println(variable);
}

fun main(args: Array<String>) {
val scanner = Scanner(System.`in`)
// put your code here
val sq = scanner.nextDouble()
val sqrt = Math.sqrt(Math.sqrt(sq))
println(sqrt)

",[]
2760563,python3,"text = input()
2,python3,"text = input()
words = text.split()
for word in words:
# finish the code here
Expand All @@ -27,4 +27,4 @@ for word in words:
print(word)
else:
continue
","[{""origin_class"": ""E0001"", ""type"": ""ERROR_PRONE"", ""description"": ""invalid syntax (<unknown>, line 5)"", ""file_path"": """", ""line_no"": 5, ""column_no"": 32, ""inspector_type"": ""PYLINT""}, {""origin_class"": ""E999"", ""type"": ""CODE_STYLE"", ""description"": ""SyntaxError: invalid syntax"", ""file_path"": """", ""line_no"": 5, ""column_no"": 30, ""inspector_type"": ""FLAKE8""}, {""origin_class"": ""E113"", ""type"": ""CODE_STYLE"", ""description"": ""unexpected indentation"", ""file_path"": """", ""line_no"": 6, ""column_no"": 9, ""inspector_type"": ""FLAKE8""}]"
",[]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,lang,code,raw_issues
1,,"print(""Hello, World!"")",
2,python3,,
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
id,lang,code
2760103,java8,"import java.util.*;
1,java11,"public class Main {
public static void main(String[] args) {

int variable = 123456; // Change this line

System.out.println(variable);
}

fun main(args: Array<String>) {
val scanner = Scanner(System.`in`)
// put your code here
val sq = scanner.nextDouble()
val sqrt = Math.sqrt(Math.sqrt(sq))
println(sqrt)

"
2760563,python3,"text = input()
2,python3,"text = input()
words = text.split()
for word in words:
# finish the code here
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,lang,code
1,,"print(""Hello, World!"")"
2,python3,