From c00ca2e96d56dce4bf38464302df32adb0e85470 Mon Sep 17 00:00:00 2001
From: "Maximilian R." <maximus12793@gmail.com>
Date: Sat, 30 Sep 2023 17:40:39 -0500
Subject: [PATCH 1/3] Clean up code

---
 codebleu/codebleu.py     | 17 +++--------------
 evaluate_app/codebleu.py |  2 +-
 tests/test_codebleu.py   |  5 +++--
 3 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py
index 76aa6b3..b35e597 100644
--- a/codebleu/codebleu.py
+++ b/codebleu/codebleu.py
@@ -7,7 +7,6 @@
 from . import bleu, dataflow_match, syntax_match, weighted_ngram_match
 
 PACKAGE_DIR = Path(__file__).parent
-# AVAILABLE_LANGS = ['java', 'javascript', 'c_sharp', 'php', 'go', 'python', 'ruby']
 AVAILABLE_LANGS = ["java", "javascript", "c_sharp", "php", "c", "cpp", "python"]  # keywords available
 
 
@@ -56,7 +55,8 @@ def tokenizer(s):
     ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)
 
     # calculate weighted ngram match
-    keywords = [x.strip() for x in open(keywords_dir / (lang + ".txt"), "r", encoding="utf-8").readlines()]
+    with open(keywords_dir / (lang + ".txt"), "r", encoding="utf-8") as f:
+        keywords = [x.strip() for x in f.readlines()]
 
     def make_weights(reference_tokens, key_word_list):
         return {token: 1 if token in key_word_list else 0.2 for token in reference_tokens}
@@ -74,25 +74,14 @@ def make_weights(reference_tokens, key_word_list):
     # calculate dataflow match
     dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang, lang_so_file)
 
-    # print(
-    #     "ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}".format(
-    #         ngram_match_score,
-    #         weighted_ngram_match_score,
-    #         syntax_match_score,
-    #         dataflow_match_score,
-    #     )
-    # )
-
     alpha, beta, gamma, theta = weights
     code_bleu_score = (
         alpha * ngram_match_score
         + beta * weighted_ngram_match_score
         + gamma * syntax_match_score
-        + theta * (dataflow_match_score or 1)
+        + theta * (dataflow_match_score or 0)
     )
 
-    # print("CodeBLEU score: ", code_bleu_score)
-
     return {
         "codebleu": code_bleu_score,
         "ngram_match_score": ngram_match_score,
diff --git a/evaluate_app/codebleu.py b/evaluate_app/codebleu.py
index dd86973..bb365ba 100644
--- a/evaluate_app/codebleu.py
+++ b/evaluate_app/codebleu.py
@@ -106,7 +106,7 @@ def _info(self):
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
         # workarounds as this file have to be named codebleu (evaluate library requirement)
-        self.codebleu_package = importlib.import_module('codebleu')
+        self.codebleu_package = importlib.import_module("codebleu")
         pass
 
     def _compute(self, predictions, references, lang, weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None):
diff --git a/tests/test_codebleu.py b/tests/test_codebleu.py
index 1102415..ec016cd 100644
--- a/tests/test_codebleu.py
+++ b/tests/test_codebleu.py
@@ -7,7 +7,8 @@
 
 
 @pytest.mark.parametrize(['predictions', 'references', 'codebleu'], [
-    (['some rannnndom words in length more than 3'], ['def test ( ) :\n pass'], 0.25),  # 'cause data_flow is 0 and considered as 1
+    (['some rannnndom words in length more than 3'],
+     ['def test ( ) :\n pass'], 0.25),  # 'cause data_flow is 0 and considered as 1
     (['def bar ( y , x ) :\n    a = x * x\n    return a'], ['def foo ( x ) :\n    return x'], 0.4),
     (['def foo ( x ) :\n    return x * x'], ['def bar ( x ) :\n    return x'], 0.6),
     (['def bar ( x ) :\n    return x'], ['def foo ( x ) :\n    return x'], 0.8),
@@ -19,7 +20,7 @@ def test_simple_cases(predictions: List[Any], references: List[Any], codebleu: f
     assert result['codebleu'] == pytest.approx(codebleu, 0.1)
 
 
-@pytest.mark.parametrize(['lang'], [(l,) for l in AVAILABLE_LANGS])
+@pytest.mark.parametrize(['lang'], [(lang,) for lang in AVAILABLE_LANGS])
 def test_exact_match_works_for_all_langs(lang: str) -> None:
     predictions = references = ['some matching string a couple of times']
     assert calc_codebleu(references, predictions, lang)['codebleu'] == 1.0

From fb7893c809ff8b72f6ac01ed26e343f7d6647940 Mon Sep 17 00:00:00 2001
From: "Maximilian R." <maximus12793@gmail.com>
Date: Sun, 1 Oct 2023 14:31:21 -0500
Subject: [PATCH 2/3] Fix tests + logging

---
 codebleu/codebleu.py             |  2 +-
 codebleu/dataflow_match.py       |  4 ++--
 codebleu/weighted_ngram_match.py |  2 --
 tests/test_codebleu.py           | 19 ++++++++++---------
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py
index b35e597..e4d928f 100644
--- a/codebleu/codebleu.py
+++ b/codebleu/codebleu.py
@@ -79,7 +79,7 @@ def make_weights(reference_tokens, key_word_list):
         alpha * ngram_match_score
         + beta * weighted_ngram_match_score
         + gamma * syntax_match_score
-        + theta * (dataflow_match_score or 0)
+        + theta * (dataflow_match_score or 1.0)
     )
 
     return {
diff --git a/codebleu/dataflow_match.py b/codebleu/dataflow_match.py
index bcd89ac..2e4217b 100644
--- a/codebleu/dataflow_match.py
+++ b/codebleu/dataflow_match.py
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
-
+import logging
 from tree_sitter import Language, Parser
 
 from .parser import (
@@ -67,7 +67,7 @@ def corpus_dataflow_match(references, candidates, lang, langso_so_file):
                         match_count += 1
                         normalized_cand_dfg.remove(dataflow)
     if total_count == 0:
-        print(
+        logging.warning(
             "WARNING: There is no reference data-flows extracted from the whole corpus, "
             "and the data-flow match score degenerates to 0. Please consider ignoring this score."
         )
diff --git a/codebleu/weighted_ngram_match.py b/codebleu/weighted_ngram_match.py
index d03a04a..507cb76 100644
--- a/codebleu/weighted_ngram_match.py
+++ b/codebleu/weighted_ngram_match.py
@@ -192,7 +192,6 @@ def corpus_bleu(
     #       it tries to retain the Fraction object as much as the
     #       smoothing method allows.
     p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths)
-    # pdb.set_trace()
     s = (w_i * math.log(p_i[0] / p_i[1]) for w_i, p_i in zip(weights, p_n))
     s = bp * math.exp(math.fsum(s))
     return s
@@ -212,7 +211,6 @@ def modified_recall(references, hypothesis, n):
     """
     # Extracts all ngrams in hypothesis
     # Set an empty Counter if hypothesis is empty.
-    # pdb.set_trace()
     numerator = 0
     denominator = 0
 
diff --git a/tests/test_codebleu.py b/tests/test_codebleu.py
index ec016cd..3875a94 100644
--- a/tests/test_codebleu.py
+++ b/tests/test_codebleu.py
@@ -2,6 +2,7 @@
 from typing import Any, List
 
 import pytest
+import logging
 
 from codebleu.codebleu import AVAILABLE_LANGS, calc_codebleu
 
@@ -16,7 +17,7 @@
 ])
 def test_simple_cases(predictions: List[Any], references: List[Any], codebleu: float) -> None:
     result = calc_codebleu(references, predictions, 'python')
-    print(result)
+    logging.debug(result)
     assert result['codebleu'] == pytest.approx(codebleu, 0.1)
 
 
@@ -37,7 +38,7 @@ def test_exact_match_works_for_all_langs(lang: str) -> None:
 ])
 def test_simple_cases_work_for_all_langs(lang: str, predictions: List[Any], references: List[Any]) -> None:
     result = calc_codebleu(references, predictions, lang)
-    print(result)
+    logging.debug(result)
     assert result['codebleu'] == pytest.approx(0.6, 0.1)
 
 
@@ -55,17 +56,17 @@ def test_error_when_input_length_mismatch() -> None:
     (
         ['public static int Sign ( double d ) { return ( float ) ( ( d == 0 ) ? 0 : ( c < 0.0 ) ? - 1 : 1) ; }'],
         ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
-        0.7238
+        0.7019
+    ),
+    (
+        ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'],
+        ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
+        0.8804
     ),
-    # (
-    #     ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'],
-    #     ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
-    #     0.8397
-    # ),
 ])
 def test_code_x_glue_readme_examples(predictions: List[Any], references: List[Any], codebleu: float) -> None:
     result = calc_codebleu(references, predictions, 'java')
-    print(result)
+    logging.debug(result)
     assert result['codebleu'] == pytest.approx(codebleu, 0.01)
 
 

From 8d4a1f16f6b8237d7bd41ac1a80edbceceef6ed9 Mon Sep 17 00:00:00 2001
From: "Maximilian R." <maximus12793@gmail.com>
Date: Mon, 2 Oct 2023 22:19:41 -0500
Subject: [PATCH 3/3] Use int fallback for dataflow score and fix import spacing

---
 codebleu/codebleu.py       | 2 +-
 codebleu/dataflow_match.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py
index e4d928f..1319edd 100644
--- a/codebleu/codebleu.py
+++ b/codebleu/codebleu.py
@@ -79,7 +79,7 @@ def make_weights(reference_tokens, key_word_list):
         alpha * ngram_match_score
         + beta * weighted_ngram_match_score
         + gamma * syntax_match_score
-        + theta * (dataflow_match_score or 1.0)
+        + theta * (dataflow_match_score or 1)
     )
 
     return {
diff --git a/codebleu/dataflow_match.py b/codebleu/dataflow_match.py
index 2e4217b..30e3871 100644
--- a/codebleu/dataflow_match.py
+++ b/codebleu/dataflow_match.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 import logging
+
 from tree_sitter import Language, Parser
 
 from .parser import (