diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 92c2c574a..00e1c290c 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -28,6 +28,7 @@ import logging import math from abc import ABC, abstractmethod +from itertools import zip_longest from typing import Literal import numpy as np @@ -152,8 +153,10 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float: ) preds.append(pred[0]) - if self.metric_type == "bleu": - golds = [[gold[0] for gold in golds]] + # sacrebleu expects references in [ref_id][sent_id] shape, but golds is + # collected as [sent_id][ref_id]. Transpose so each inner list is one + # reference stream across all hypotheses. + golds = [list(refs) for refs in zip_longest(*golds, fillvalue=None)] corpus_score = metric.corpus_score(hypotheses=preds, references=golds) score = corpus_score.score diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json index f55028674..0afa2688d 100644 --- a/tests/unit/metrics/test_cases/chrf.json +++ b/tests/unit/metrics/test_cases/chrf.json @@ -79,9 +79,9 @@ "text": ["Das Wetter ist schön"] } ], - "expected_output": 100.0, + "expected_output": 89.16, "tolerance": 0.1, - "description": "High similarity - minor character differences (CHRF ≈ 88.0)" + "description": "High similarity - minor character differences (CHRF ≈ 89.0)" }, { "name": "CHRF - Word Order Changes", @@ -119,9 +119,9 @@ "text": ["Lernen Maschinelles"] } ], - "expected_output": 78.84, + "expected_output": 82.21, "tolerance": 0.1, - "description": "Word order changes - same characters, different order (CHRF ≈ 75.0)" + "description": "Word order changes - same characters, different order (CHRF ≈ 82.0)" }, { "name": "CHRF - Moderate Similarity", @@ -159,9 +159,9 @@ "text": ["Die Sterne"] } ], - "expected_output": 37.68, + "expected_output": 34.85, "tolerance": 0.1, - "description": "Moderate similarity - partial character overlap (CHRF ≈ 50.0)" + "description": "Moderate similarity - partial character overlap (CHRF ≈ 35.0)" }, { "name": "CHRF - Low Similarity", @@ -199,9 +199,9 @@ "text": ["Es sehr heiß"] } ], - "expected_output": 7.7, + "expected_output": 6.83, "tolerance": 0.1, - "description": "Low similarity - minimal character overlap (CHRF ≈ 20.0)" + "description": "Low similarity - minimal character overlap (CHRF ≈ 7.0)" } ] } diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json index 29c45720d..2965a484e 100644 --- a/tests/unit/metrics/test_cases/chrf_plus.json +++ b/tests/unit/metrics/test_cases/chrf_plus.json @@ -79,9 +79,9 @@ "text": ["Das Wetter ist schön"] } ], - "expected_output": 100.0, + "expected_output": 88.07, "tolerance": 0.1, - "description": "High similarity - minor character differences (CHRF++ ≈ 85.0)" + "description": "High similarity - minor character differences (CHRF++ ≈ 88.0)" }, { "name": "CHRF Plus - Moderate Similarity", @@ -119,9 +119,9 @@ "text": ["ML"] } ], - "expected_output": 58.82, + "expected_output": 24.45, "tolerance": 0.1, - "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 45.0)" + "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 24.0)" }, { "name": "CHRF Plus - Low Similarity", @@ -159,9 +159,9 @@ "text": ["Es heiß"] } ], - "expected_output": 15.0, - "tolerance": 10.0, - "description": "Low similarity - minimal character overlap (CHRF++ ≈ 15.0)" + "expected_output": 2.73, + "tolerance": 0.1, + "description": "Low similarity - minimal character overlap (CHRF++ ≈ 2.7)" } ] } diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json index 39b671b0f..691205a6b 100644 --- a/tests/unit/metrics/test_cases/ter.json +++ b/tests/unit/metrics/test_cases/ter.json @@ -79,9 +79,9 @@ "text": ["Das Wetter ist schön"] } ], - "expected_output": 0.0, - "tolerance": 0.05, - "description": "Minor edits - small word differences" + "expected_output": 9.09, + "tolerance": 0.1, + "description": "Minor edits - one missing word across 3 sentences" }, { "name": "TER - Major Edits", @@ -159,7 +159,7 @@ "text": ["Es ist sehr heiß"] } ], - "expected_output": 80.0, + "expected_output": 100.0, "tolerance": 0.1, "description": "Completely different translations - maximum edit distance" }