From c00ca2e96d56dce4bf38464302df32adb0e85470 Mon Sep 17 00:00:00 2001 From: "Maximilian R." Date: Sat, 30 Sep 2023 17:40:39 -0500 Subject: [PATCH 1/3] Clean up code --- codebleu/codebleu.py | 17 +++-------------- evaluate_app/codebleu.py | 2 +- tests/test_codebleu.py | 5 +++-- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py index 76aa6b3..b35e597 100644 --- a/codebleu/codebleu.py +++ b/codebleu/codebleu.py @@ -7,7 +7,6 @@ from . import bleu, dataflow_match, syntax_match, weighted_ngram_match PACKAGE_DIR = Path(__file__).parent -# AVAILABLE_LANGS = ['java', 'javascript', 'c_sharp', 'php', 'go', 'python', 'ruby'] AVAILABLE_LANGS = ["java", "javascript", "c_sharp", "php", "c", "cpp", "python"] # keywords available @@ -56,7 +55,8 @@ def tokenizer(s): ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps) # calculate weighted ngram match - keywords = [x.strip() for x in open(keywords_dir / (lang + ".txt"), "r", encoding="utf-8").readlines()] + with open(keywords_dir / (lang + ".txt"), "r", encoding="utf-8") as f: + keywords = [x.strip() for x in f.readlines()] def make_weights(reference_tokens, key_word_list): return {token: 1 if token in key_word_list else 0.2 for token in reference_tokens} @@ -74,25 +74,14 @@ def make_weights(reference_tokens, key_word_list): # calculate dataflow match dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang, lang_so_file) - # print( - # "ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}".format( - # ngram_match_score, - # weighted_ngram_match_score, - # syntax_match_score, - # dataflow_match_score, - # ) - # ) - alpha, beta, gamma, theta = weights code_bleu_score = ( alpha * ngram_match_score + beta * weighted_ngram_match_score + gamma * syntax_match_score - + theta * (dataflow_match_score or 1) + + theta * (dataflow_match_score or 0) ) - # print("CodeBLEU score: ", code_bleu_score) - return { "codebleu": code_bleu_score, "ngram_match_score": ngram_match_score, diff --git a/evaluate_app/codebleu.py b/evaluate_app/codebleu.py index dd86973..bb365ba 100644 --- a/evaluate_app/codebleu.py +++ b/evaluate_app/codebleu.py @@ -106,7 +106,7 @@ def _info(self): def _download_and_prepare(self, dl_manager): """Optional: download external resources useful to compute the scores""" # workarounds as this file have to be named codebleu (evaluate library requirement) - self.codebleu_package = importlib.import_module('codebleu') + self.codebleu_package = importlib.import_module("codebleu") pass def _compute(self, predictions, references, lang, weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None): diff --git a/tests/test_codebleu.py b/tests/test_codebleu.py index 1102415..ec016cd 100644 --- a/tests/test_codebleu.py +++ b/tests/test_codebleu.py @@ -7,7 +7,8 @@ @pytest.mark.parametrize(['predictions', 'references', 'codebleu'], [ - (['some rannnndom words in length more than 3'], ['def test ( ) :\n pass'], 0.25), # 'cause data_flow is 0 and considered as 1 + (['some rannnndom words in length more than 3'], + ['def test ( ) :\n pass'], 0.25), # 'cause data_flow is 0 and considered as 1 (['def bar ( y , x ) :\n a = x * x\n return a'], ['def foo ( x ) :\n return x'], 0.4), (['def foo ( x ) :\n return x * x'], ['def bar ( x ) :\n return x'], 0.6), (['def bar ( x ) :\n return x'], ['def foo ( x ) :\n return x'], 0.8), @@ -19,7 +20,7 @@ def test_simple_cases(predictions: List[Any], references: List[Any], codebleu: f assert result['codebleu'] == pytest.approx(codebleu, 0.1) -@pytest.mark.parametrize(['lang'], [(l,) for l in AVAILABLE_LANGS]) +@pytest.mark.parametrize(['lang'], [(lang,) for lang in AVAILABLE_LANGS]) def test_exact_match_works_for_all_langs(lang: str) -> None: predictions = references = ['some matching string a couple of times'] assert calc_codebleu(references, predictions, lang)['codebleu'] == 1.0 From fb7893c809ff8b72f6ac01ed26e343f7d6647940 Mon Sep 17 00:00:00 2001 From: "Maximilian R." Date: Sun, 1 Oct 2023 14:31:21 -0500 Subject: [PATCH 2/3] Fix tests + logging --- codebleu/codebleu.py | 2 +- codebleu/dataflow_match.py | 4 ++-- codebleu/weighted_ngram_match.py | 2 -- tests/test_codebleu.py | 19 ++++++++++--------- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py index b35e597..e4d928f 100644 --- a/codebleu/codebleu.py +++ b/codebleu/codebleu.py @@ -79,7 +79,7 @@ def make_weights(reference_tokens, key_word_list): alpha * ngram_match_score + beta * weighted_ngram_match_score + gamma * syntax_match_score - + theta * (dataflow_match_score or 0) + + theta * (dataflow_match_score or 1.0) ) return { diff --git a/codebleu/dataflow_match.py b/codebleu/dataflow_match.py index bcd89ac..2e4217b 100644 --- a/codebleu/dataflow_match.py +++ b/codebleu/dataflow_match.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. - +import logging from tree_sitter import Language, Parser from .parser import ( @@ -67,7 +67,7 @@ def corpus_dataflow_match(references, candidates, lang, langso_so_file): match_count += 1 normalized_cand_dfg.remove(dataflow) if total_count == 0: - print( + logging.warning( "WARNING: There is no reference data-flows extracted from the whole corpus, " "and the data-flow match score degenerates to 0. Please consider ignoring this score." ) diff --git a/codebleu/weighted_ngram_match.py b/codebleu/weighted_ngram_match.py index d03a04a..507cb76 100644 --- a/codebleu/weighted_ngram_match.py +++ b/codebleu/weighted_ngram_match.py @@ -192,7 +192,6 @@ def corpus_bleu( # it tries to retain the Fraction object as much as the # smoothing method allows. p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths) - # pdb.set_trace() s = (w_i * math.log(p_i[0] / p_i[1]) for w_i, p_i in zip(weights, p_n)) s = bp * math.exp(math.fsum(s)) return s @@ -212,7 +211,6 @@ def modified_recall(references, hypothesis, n): """ # Extracts all ngrams in hypothesis # Set an empty Counter if hypothesis is empty. - # pdb.set_trace() numerator = 0 denominator = 0 diff --git a/tests/test_codebleu.py b/tests/test_codebleu.py index ec016cd..3875a94 100644 --- a/tests/test_codebleu.py +++ b/tests/test_codebleu.py @@ -2,6 +2,7 @@ from typing import Any, List import pytest +import logging from codebleu.codebleu import AVAILABLE_LANGS, calc_codebleu @@ -16,7 +17,7 @@ ]) def test_simple_cases(predictions: List[Any], references: List[Any], codebleu: float) -> None: result = calc_codebleu(references, predictions, 'python') - print(result) + logging.debug(result) assert result['codebleu'] == pytest.approx(codebleu, 0.1) @@ -37,7 +38,7 @@ def test_exact_match_works_for_all_langs(lang: str) -> None: ]) def test_simple_cases_work_for_all_langs(lang: str, predictions: List[Any], references: List[Any]) -> None: result = calc_codebleu(references, predictions, lang) - print(result) + logging.debug(result) assert result['codebleu'] == pytest.approx(0.6, 0.1) @@ -55,17 +56,17 @@ def test_error_when_input_length_mismatch() -> None: ( ['public static int Sign ( double d ) { return ( float ) ( ( d == 0 ) ? 0 : ( c < 0.0 ) ? - 1 : 1) ; }'], ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'], - 0.7238 + 0.7019 + ), + ( + ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'], + ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'], + 0.8804 ), - # ( - # ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'], - # ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'], - # 0.8397 - # ), ]) def test_code_x_glue_readme_examples(predictions: List[Any], references: List[Any], codebleu: float) -> None: result = calc_codebleu(references, predictions, 'java') - print(result) + logging.debug(result) assert result['codebleu'] == pytest.approx(codebleu, 0.01) From 8d4a1f16f6b8237d7bd41ac1a80edbceceef6ed9 Mon Sep 17 00:00:00 2001 From: "Maximilian R." Date: Mon, 2 Oct 2023 22:19:41 -0500 Subject: [PATCH 3/3] update --- codebleu/codebleu.py | 2 +- codebleu/dataflow_match.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py index e4d928f..1319edd 100644 --- a/codebleu/codebleu.py +++ b/codebleu/codebleu.py @@ -79,7 +79,7 @@ def make_weights(reference_tokens, key_word_list): alpha * ngram_match_score + beta * weighted_ngram_match_score + gamma * syntax_match_score - + theta * (dataflow_match_score or 1.0) + + theta * (dataflow_match_score or 1) ) return { diff --git a/codebleu/dataflow_match.py b/codebleu/dataflow_match.py index 2e4217b..30e3871 100644 --- a/codebleu/dataflow_match.py +++ b/codebleu/dataflow_match.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging + from tree_sitter import Language, Parser from .parser import (