nirbarazida
/
promptfoo
mirror of https://github.com/promptfoo/promptfoo


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
# Human-readable title for this evaluation.
description: Reproduce Mistral Magistral AIME2024 benchmark

# A single prompt template, written as a literal block scalar (`|`) so the
# line breaks below are preserved as written.
# `{{question}}` is a template placeholder — presumably filled from each test
# case's variables; confirm it matches the dataset's column name.
# The closing sentence format ("Therefore, the answer is ...") gives the
# response a fixed place to state its final answer.
prompts:
  - |
    Solve this AIME mathematical problem step by step.

    Problem: {{question}}

    Think through this carefully and provide your final answer as a 3-digit integer (000-999).
    End with: "Therefore, the answer is [your answer]."

# Two Magistral variants, configured with identical sampling settings so the
# only difference between the two runs is the model itself.
providers:
  - label: Magistral Medium
    id: 'mistral:magistral-medium-latest'
    config:
      # Large completion budget — presumably to leave room for long
      # reasoning traces; confirm against the provider's limits.
      max_tokens: 40960
      temperature: 0.7
      top_p: 0.95
  - label: Magistral Small
    id: 'mistral:magistral-small-latest'
    config:
      # Kept in sync with the Medium entry above.
      max_tokens: 40960
      temperature: 0.7
      top_p: 0.95

# Test cases are not listed inline: they are referenced by a huggingface://
# URI pointing at the `test` split of the sea-snell/aime-2024 dataset.
# NOTE(review): the prompt and grading rubric in this file reference
# {{question}} and {{answer}} — confirm these match the dataset's column names.
tests:
  - huggingface://datasets/sea-snell/aime-2024?split=test

# Default assertion block; presumably applied to every test case loaded under
# `tests` (confirm against the promptfoo configuration reference).
defaultTest:
  assert:
    # Model-graded check: a grader model reads the rubric below and returns
    # PASS/FAIL. `{{answer}}` is a template placeholder — presumably the
    # reference answer from the test case; confirm against the dataset schema.
    #
    # The prompt asks the model for a zero-padded 3-digit answer (000-999),
    # so criterion 2 compares by integer value: a correct "025" must not be
    # failed against a reference answer of 25.
    - type: llm-rubric
      value: |
        Evaluate this mathematical solution to an AIME competition problem.

        The correct answer is: {{answer}}

        Grade as PASS if and only if:
        1. The response shows clear step-by-step mathematical reasoning
        2. The final answer presented equals {{answer}} exactly when compared as integers (leading zeros are not significant, e.g. 025 and 25 are the same answer)
        3. The mathematical work supports the conclusion

        Grade as FAIL if the final answer is incorrect, regardless of the reasoning quality.