
Commit 74f8bb0

committed
Merge branch 'main' of github.com:google/lmeval
2 parents 4640d4e + d32fe5d commit 74f8bb0


10 files changed: +290 −18 lines changed


lmeval/models/mock_model.py

Lines changed: 2 additions & 1 deletion
@@ -60,6 +60,7 @@ def generate_text(
             prompt: str,
             medias: List[Media] | Media = [],
             temperature: float | None = 0.0,
+            max_tokens: int = 4096,
             completions: int = 1) -> LMAnswer:
         # print(f"generate_text: {prompt}")
         id = "mock"
@@ -85,7 +86,7 @@ def generate_text(
 
     def batch_generate_text(
             self, prompts: list[str], medias: list[list[Media] | Media] = [],
-            temperature: float | None = 0.0,
+            temperature: float | None = 0.0, max_tokens:int = 4096,
             completions: int = 1) -> Generator[Tuple[int, LMAnswer], None, None]:
         log.info(f"mock-batch_generate_text: {len(prompts)} prompts")
         for i, prompt in enumerate(prompts):
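
For context, a minimal sketch of how the widened signature might be called. The class name MockModel and its no-argument constructor are assumptions (neither appears in this diff); only the generate_text keyword arguments come from the change itself.

# Hypothetical usage sketch: MockModel and its no-arg constructor are assumed, not shown in this diff.
from lmeval.models.mock_model import MockModel

model = MockModel()
answer = model.generate_text(prompt="What is the capital of France?",
                             temperature=0.0,
                             max_tokens=4096)  # keyword added by this commit
print(answer.answer)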

lmeval/prompts/multi_choices_prompts.py

Lines changed: 26 additions & 6 deletions
@@ -39,17 +39,21 @@
 
 
 class MultiChoicesMultiAnswersPrompt(Prompt):
+    use_original_letters: bool = False
+
     def __init__(self,
                  template: str = MULTI_ANSWER_TEMPLATE,
                  name: str = "Multi Choices Multi Answer Picker",
                  description: str = "Ask the model to return the letters associated with potentially multiple correct answers",
                  task_type = TaskType.multiple_choices_multiple_answers,
                  url: str = '',
-                 version: str = '1.0'):
+                 version: str = '1.0',
+                 use_original_letters: bool = False):
 
         super().__init__(name=name, description=description,
                          task_type=task_type, template=template, url=url,
                          version=version)
+        self.use_original_letters = use_original_letters
 
     def render(self, question: Question, task: Task) -> str:
         "Render prompt for a given question and task"
@@ -69,15 +73,18 @@ def render(self, question: Question, task: Task) -> str:
             question.letter_mapping = question.prompt_cache[version]['letter_mapping']
         else:
             possible_answers = [question.answer] + question.additional_answers + question.choices
-            random.shuffle(possible_answers)
+            if self.use_original_letters:
+                assert len(possible_answers) == len(question.original_letters), f"Original letters {question.original_letters} should match the number of possible answers {possible_answers}"
+            else:
+                random.shuffle(possible_answers)
 
             # Construct the list of possible answers
             choices_list = []
             letters_list = []
             letter_mapping = {}
             correct_letters = []
             for idx, answer in enumerate(possible_answers):
-                letter = ascii_uppercase[idx]
+                letter = question.original_letters[idx] if self.use_original_letters else ascii_uppercase[idx]
                 # don't put space between letter and answer it decrease accuracy...
                 choices_list.append(f"{letter}:{answer}")
                 letters_list.append(letter)
@@ -88,6 +95,10 @@ def render(self, question: Question, task: Task) -> str:
                     correct_letters.append(letter)
                 if answer in question.additional_answers:
                     correct_letters.append(letter)
+            if self.use_original_letters:
+                choices_list.sort()
+                letters_list.sort()
+                correct_letters.sort()
 
             question.answer_letter = ', '.join(correct_letters)
 
@@ -132,18 +143,21 @@ def render(self, question: Question, task: Task) -> str:
 
 
 class MultiChoicesPrompt(Prompt):
+    use_original_letters: bool = False
 
     def __init__(self,
                  template: str = TEMPLATE,
                  name: str = "Multi Choices Picker",
                  description: str = "Ask the model to return the letter associated with the correct answer",
                  task_type = TaskType.multiple_choices,
                  url: str = '',
-                 version: str = '1.0'):
+                 version: str = '1.0',
+                 use_original_letters: bool = False):
 
         super().__init__(name=name, description=description,
                          task_type=task_type, template=template, url=url,
                          version=version)
+        self.use_original_letters = use_original_letters
 
     def render(self, question: Question, task: Task) -> str:
         "Render prompt for a given question and task"
@@ -162,20 +176,26 @@ def render(self, question: Question, task: Task) -> str:
             question.letter_mapping = question.prompt_cache[version]['letter_mapping']
         else:
             possible_answers = [question.answer] + question.choices
-            random.shuffle(possible_answers)
+            if self.use_original_letters:
+                assert len(possible_answers) == len(question.original_letters), f"Original letters {question.original_letters} should match the number of possible answers {possible_answers}"
+            else:
+                random.shuffle(possible_answers)
 
             # Construct the list of possible answers
             choices_list = []
             letters_list = []
             letter_mapping = {}
             for idx, answer in enumerate(possible_answers):
-                letter = ascii_uppercase[idx]
+                letter = question.original_letters[idx] if self.use_original_letters else ascii_uppercase[idx]
                 # don't put space between letter and answer it decrease accuracy...
                 choices_list.append(f"{letter}:{answer}")
                 letters_list.append(letter)
                 letter_mapping[letter] = answer
                 if answer == question.answer:
                     question.answer_letter = letter
+            if self.use_original_letters:
+                choices_list.sort()
+                letters_list.sort()
 
             # flatten
             multi_choices = "\n".join(choices_list)
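
A minimal sketch of the new option, mirroring the tests added below (import paths are inferred from this commit's file layout): with use_original_letters=True the question must carry one entry in original_letters per element of [answer] + additional_answers + choices, each answer keeps its assigned letter, and the rendered list is sorted by letter instead of shuffled.

# Sketch of the new flag; import paths are inferred from the repository layout shown in this commit.
from lmeval.question import Question
from lmeval.prompts.multi_choices_prompts import MultiChoicesPrompt

question = Question(id=1, question="What is the capital of France?",
                    answer="Paris",
                    choices=["London", "Berlin", "Madrid"],
                    original_letters=["D", "B", "A", "C"])  # D->Paris, B->London, A->Berlin, C->Madrid

prompt = MultiChoicesPrompt(use_original_letters=True)
# rendered = prompt.render(question, task)  # with a Task as in the tests; choices render sorted:
#                                           # A:Berlin, B:London, C:Madrid, D:Paris

With the flag left at its default of False, behaviour is unchanged: answers are shuffled and relabelled A, B, C, ... as before.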

lmeval/prompts/multi_choices_prompts_test.py

Lines changed: 72 additions & 1 deletion
@@ -57,7 +57,50 @@ def test_multi_choices_multi_answers():
     for letter, answer in question.letter_mapping.items():
         assert f"{letter}:{answer}" in rendered_prompt
 
+def test_multi_choices_multi_answers_original_letters():
+    prompt = MultiChoicesMultiAnswersPrompt(use_original_letters=True)
+    question_text = "What is true about Paris"
+    question = Question(id=1,
+                        question=question_text,
+                        answer="It is the capital of France",
+                        additional_answers=["The Louvre is there",
+                                            "The effeil tower is there"],
+                        choices=["It is the capital of Portugal",
+                                 "It is the capital of Germany",
+                                 "The Guggenheim museum is there",
+                                 "THe MoMa is there"],
+                        original_letters=['G', 'F', 'E', 'D', 'C', 'B', 'A'])
+
+    task = Task(name="Paris Info", type=TaskType.multiple_choices_multiple_answers,
+                scorer=get_scorer(ScorerType.contains_answer_letters_insensitive))
+    rendered_prompt = prompt.render(question, task)
+    print(prompt.template)
+    print(rendered_prompt)
+
+    assert question_text in rendered_prompt
+    for choice in question.choices:
+        assert choice in rendered_prompt
+    for answer in question.answer:
+        assert answer in rendered_prompt
+    for c in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
+        assert f"\n{c}:" in rendered_prompt
+
+    # check that the answer letter is tied to the correct answer
+
+    assert f"{answer}" in rendered_prompt
+
+    # check that the additional answers are in the prompt
+    for additional_answer in question.additional_answers:
+        assert additional_answer in rendered_prompt
 
+    # check the mapping from letter to answer exist
+    for letter, answer in question.letter_mapping.items():
+        assert f"{letter}:{answer}" in rendered_prompt
+    # test the original order is preserved
+    for idx, answer in enumerate(
+        [question.answer] + question.additional_answers + question.choices
+    ):
+        assert f"{question.original_letters[idx]}:{answer}"in rendered_prompt
 
 def test_multi_choices():
     prompt = MultiChoicesPrompt()
@@ -83,6 +126,34 @@ def test_multi_choices():
     for letter, answer in question.letter_mapping.items():
         assert f"{letter}:{answer}" in rendered_prompt
 
+def test_multi_choices_original_letters():
+    prompt = MultiChoicesPrompt(use_original_letters=True)
+    question_text = "What is the capital of France?"
+    question = Question(id=1, question=question_text, answer="Paris",
+                        choices=["London", "Berlin", "Madrid"],
+                        original_letters=["D", "B", "A", "C"])
+    task = Task(name="City capital", type=TaskType.multiple_choices,
+                scorer=get_scorer(ScorerType.contain_text_insensitive))
+    rendered_prompt = prompt.render(question, task)
+    print(prompt.template)
+    print(rendered_prompt)
+
+
+    assert question_text in rendered_prompt
+    for choice in question.choices:
+        assert choice in rendered_prompt
+    assert question.answer in rendered_prompt
+    for c in ['A', 'B', 'C', 'D']:
+        assert f"\n{c}:" in rendered_prompt
+
+    # check that the answer letter is tied to the correct answer
+    assert f"{question.answer_letter}:{question.answer}" in rendered_prompt
+    for letter, answer in question.letter_mapping.items():
+        assert f"{letter}:{answer}" in rendered_prompt
+    # test the original order is preserved
+    for idx, answer in enumerate([question.answer] + question.choices):
+        assert f"{question.original_letters[idx]}:{answer}"in rendered_prompt
+
 def test_repeated_used_multi_choices():
     prompt = MultiChoicesPrompt()
     question_text = "What is the capital of France?"
@@ -122,4 +193,4 @@ def test_answer_in_choice_fail():
     task = Task(name="City capital", type=TaskType.multiple_choices,
                 scorer=get_scorer(ScorerType.contain_text_insensitive))
     with pytest.raises(AssertionError):
-        prompt.render(question, task)
+        prompt.render(question, task)

lmeval/question.py

Lines changed: 2 additions & 1 deletion
@@ -53,7 +53,8 @@ class Question(CustomModel):
     multi_choices: str = Field(default="")
     letter_mapping: dict = Field(default_factory=dict,
                                  description="Keep track of which letter is associated with which answer")
-
+    original_letters: List[str] = Field(default_factory=list,
+                                        description="Keep track of the original letters for: [anwer] + additional_answers + choices")
 
     # cache template rendering keyed by prompt version to ensure consistency
     # accross model evaluations.
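
The new field is purely positional, as its description states: original_letters[i] labels the i-th element of [answer] + additional_answers + choices. A small worked mapping, with values adapted from the new multi-answer test:

# Positional pairing used by the prompts; plain Python, values adapted from the test file in this commit.
answer = "It is the capital of France"
additional_answers = ["The Louvre is there", "The Eiffel tower is there"]
choices = ["It is the capital of Portugal", "It is the capital of Germany",
           "The Guggenheim museum is there", "The MoMA is there"]
original_letters = ['G', 'F', 'E', 'D', 'C', 'B', 'A']

for letter, text in zip(original_letters, [answer] + additional_answers + choices):
    print(f"{letter}:{text}")  # G labels the answer, F and E the additional answers, D..A the distractors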

lmeval/scorers/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from .scorer import Scorer
+from .llm_rater import LLMRater
 from .loader import get_scorer, list_scorers
 from .dummy_scorer import Always0Scorer, Always1Scorer
 from .boolean_answer import BooleanAnswerScorer
@@ -38,4 +39,5 @@
     "ContainAnswerLetterInsensitive",
     "ContainAnswerLettersInsensitive",
     "PuntDetector",
-]
+    "LLMRater",
+]

lmeval/scorers/llm_rater.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from string import Template
+from pydantic import Field
+from typing_extensions import override
+
+from ..enums import Modality, ScorerType
+from ..logger import log
+from ..models import LMAnswer
+from ..question import Question
+from .scorer import Scorer
+
+DEFAULT_RATER_TEMPLATE = Template('''
+You are an impartial evaluator whose job is to determine if two sets of answer to a question are equivalent.
+The question is this:
+
+<question>
+$question
+</question>
+
+Here are two sets of answers:
+
+<answer1>
+$expected
+</answer1>
+
+<answer2>
+$actual
+</answer2>
+
+Rate on the scale from 0.0 to 1.0 how similar answer1 is to answer2. Here 0.0 means they are completely different
+and 1.0 means they are semantically equivalent. Here are some rubrics to help you:
+
+1. Using the question as the context, list all the relevant facts from answer1 and compare them with the facts
+presented in answer2 to see if they are the equivalent.
+2. Do both answers come to the same conclusion?
+3. Do not consider stylistic differences such as the tone, the writing presentation (for instance bullet points vs paragraph).
+
+Write your rating and reasoning for the rateing in json format
+like this:
+
+{
+"score": the rating score between 0 and 1,
+"reasoning": explain how you arrived at this rating
+}
+
+''')
+
+
+def _parse_response_as_json(val:str):
+    jline = val.split('\n')
+    start = 1 if jline[0].startswith("```") else 0
+    end = -1 if jline[-1].startswith("```") else len(jline)
+    j = '\n'.join(jline[start:end])
+    return json.loads(j)
+
+
+class LLMRater(Scorer):
+    """A scorer using a LLM to rate the similiarity between the expected and actual answers.
+    """
+    class Config:
+        arbitrary_types_allowed = True  # to enable Template as an attribute
+    name: str = ScorerType.llm_rater.name
+    description: str = 'Calling a model to rate the answer on the scale from 0 to 1'
+    type: ScorerType = ScorerType.llm_rater
+    modality: Modality = Modality.text  # assume text for now
+    # The template is expect to have 3 parmeters: $question, $expectd, $actual. $question
+    # is the question asked, expected is the right answer and actual is the received answer.
+    # The prompt shoudl return JSON with a field "rating"
+    rater_prompt_template: Template = DEFAULT_RATER_TEMPLATE
+    temperature: float = Field(default=0.0)
+    max_tokens: int = Field(default=4096)
+
+    @override
+    def _score(self,
+               model_answer: LMAnswer,
+               question: Question,
+               task,
+               debug: bool = False) -> float:
+        # if model for the class is set, use it, else use the model from the answer
+        model = self.model if self.model else model_answer.model
+        assert model  # must have a model
+        prompt = self.rater_prompt_template.safe_substitute(
+            question=question.question,
+            expected=question.answer,
+            actual=model_answer.answer)
+        ans = model.generate_text(prompt=prompt,
+                                  temperature=self.temperature,
+                                  max_tokens=self.max_tokens)
+        if ans.iserror:
+            log.error('Rater failed with error %s', ans.error_reason)
+            return -1.0
+        if ans.ispunting:
+            log.error('Rater punted')
+            return 0.0
+        try:
+            jans = _parse_response_as_json(ans.answer)
+            score = jans.get('score', None)
+            assert score is not None
+            return score
+        except Exception as e:  # pylint: disable=broad-except
+            log.error('Rater json parsing failed: ans = %s, exception = %s',
+                      ans.answer, e)
+            return -1.0
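
Note that _parse_response_as_json strips an optional Markdown code fence before calling json.loads, so the rater tolerates judge models that wrap their JSON in ``` blocks. A quick illustration of both accepted shapes (the sample strings below are made up):

# Demo of the fence-stripping helper added in this file (module-private, imported here only for illustration).
from lmeval.scorers.llm_rater import _parse_response_as_json

fenced = '```json\n{"score": 0.9, "reasoning": "same conclusion"}\n```'
plain = '{"score": 0.9, "reasoning": "same conclusion"}'

assert _parse_response_as_json(fenced)["score"] == 0.9
assert _parse_response_as_json(plain)["score"] == 0.9

How an LLMRater instance obtains its judge model (the self.model checked in _score) is defined by the Scorer base class, which is not part of this diff.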
