# Source: Hugging Face Space TIGER-Lab/MMLU-Pro (running on CPU Upgrade).
# File size: 7,037 bytes.

import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

# Token handed to huggingface_hub's Repository below; None when the
# HF_TOKEN environment variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Subject categories; each one is a score column of the leaderboard table.
SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]

# Leaderboard columns: two descriptive columns, the overall score, then one
# column per subject. Derived from SUBJECTS so the two lists cannot drift.
MODEL_INFO = ["Models", "Data Source", "Overall", *SUBJECTS]

# Per-column type tags, parallel to MODEL_INFO: the two descriptive columns
# are 'markdown', every score column is 'number'.
# NOTE(review): presumably passed to the table widget defined outside this
# section -- confirm against the UI code.
DATA_TITLE_TYPE = ['markdown', 'markdown'] + ['number'] * (len(MODEL_INFO) - 2)

SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
# Built with plain string formatting: os.path.join is for filesystem paths,
# not URLs.
SUBMISSION_URL = f"https://huggingface.co/datasets/TIGER-Lab/{SUBMISSION_NAME}"
# Despite the name, this is the path of the results *file* inside the local
# clone of the submission repo.
CSV_DIR = f"./{SUBMISSION_NAME}/results.csv"

# Alias (same list object) used when selecting the columns of the table.
COLUMN_NAMES = MODEL_INFO

# Markdown introduction for the leaderboard: what MMLU-Pro is, how it differs
# from MMLU, and links to the dataset, evaluation scripts and paper.
# NOTE(review): presumably rendered by UI code outside this section.
LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard

## Introduction
We introduce MMLU-Pro, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.  


## What's new about MMLU-Pro

Compared to the original MMLU, there are three major differences:

- The original MMLU dataset only contains 4 options, MMLU-Pro increases it to 10 options. The increase in options will make the evaluation more realistic and challenging. The random guessing will lead to a much lower score.
- The original MMLU dataset contains mostly knowledge-driven questions without requiring much reasoning. Therefore, PPL results are normally better than CoT. In our dataset, we increase the problem difficulty and integrate more reasoning-focused problems. In MMLU-Pro, CoT can be 20% higher than PPL. 
- By increasing the distractor numbers, we significantly reduce the probability of correct guess by chance to boost the benchmark’s robustness. Specifically, with 24 different prompt styles tested, the sensitivity of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in MMLU-Pro.

For detailed information about the dataset, visit our page on Hugging Face:  https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro. 

If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: https://github.com/TIGER-AI-Lab/MMLU-Pro.

If you would like to learn more details about our dataset, please check out our paper: https://arxiv.org/abs/2406.01574.

Below you can find the accuracies of different models tested on this dataset.

"""

# Placeholder text for the table section: just a newline followed by four
# spaces, i.e. effectively blank.
TABLE_INTRODUCTION = "\n    "

# Markdown summary of the dataset: number of answer options per question and
# the sources the questions were collected from.
LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and Options:** Each question within the dataset typically has **ten** multiple-choice options, except for some that were reduced during the manual review process to remove unreasonable choices. This increase from the original **four** options per question is designed to enhance complexity and robustness, necessitating deeper reasoning to discern the correct answer among a larger pool of potential distractors.
- **Sources:** The dataset consolidates questions from several sources:
  - **Original MMLU Questions:** Part of the dataset comes from the original MMLU dataset. We remove the trivial and ambiguous questions.
  - **STEM Website:** Hand-picking high-quality STEM problems from the Internet.
  - **TheoremQA:** High-quality human-annotated questions requiring theorems to solve.
  - **SciBench:** Science questions from college exams.
"""

# Label and BibTeX entry for citing the MMLU-Pro paper. The entry is a raw
# string so the BibTeX braces and any backslashes are kept verbatim.
# NOTE(review): presumably shown in a copyable citation box by UI code outside
# this section.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@misc{wang2024mmlupro,
      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark}, 
      author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
      year={2024},
      eprint={2406.01574},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
"""

# Markdown instructions for submitting results: the expected JSON layout of
# the output file and where to send it.
# NOTE(review): the contact address below starts with "ubo" while the first
# author in the citation is "Yubo Wang" -- confirm the address is not missing
# a leading "y".
SUBMIT_INTRODUCTION = """# Submit on MMLU-Pro Leaderboard Introduction

## ⚠ Please note that you need to submit the JSON file with the following format:

```json
[
    {
        "question_id": 123,
        "question": "abc",
        "options": ["abc", "xyz", ...], 
        "answer": "ABC",
        "answer_index": 1,
        "category": "abc",
        "pred": "B",
        "model_outputs": ""
    }, ...
]
```
You can generate an output file in the above format using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at GitHub: https://github.com/TIGER-AI-Lab/MMLU-Pro. After generating the file, please send us an email at ubo.wang.sunny@gmail.com, attaching the output file.
"""


def get_df():
    """Pull the latest submissions and return them as the leaderboard table.

    Opens the local clone of the submission repo in ``SUBMISSION_NAME``
    (cloning from ``SUBMISSION_URL`` when needed), pulls, then reads
    ``CSV_DIR``.

    Returns:
        pandas.DataFrame: rows sorted by "Overall" in descending order,
        restricted to the columns in ``COLUMN_NAMES`` (in that order).

    Raises:
        KeyError: if the CSV lacks any column listed in ``COLUMN_NAMES``.
    """
    # repo_type matches add_new_eval, which opens the same dataset repo.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by=['Overall'], ascending=False)
    return df[COLUMN_NAMES]


def add_new_eval(
    input_file,
):
    """Append one uploaded result to the submissions CSV and push it to the Hub.

    Args:
        input_file: JSON text (anything ``json.loads`` accepts) holding one
            object with the keys "Model", "Overall" and one score per entry
            of ``SUBJECTS``. "Data Source" is optional and defaults to an
            empty string.

    Returns:
        str: a status message -- "Error! Empty file!" when nothing was
        uploaded, 'The entry already exists' when the model name is already
        in the CSV, 'Submission Successful' otherwise.

    Raises:
        json.JSONDecodeError: if ``input_file`` is not valid JSON.
        KeyError: if a required key is missing from the uploaded object.
    """
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)

    # Key every value by its CSV column name. Writing by name (instead of by
    # position) keeps the scores aligned with the header, which also contains
    # a "Data Source" column between the model name and the scores.
    record = {
        "Models": f'{upload_data["Model"]}',
        "Data Source": upload_data.get("Data Source", ""),
        "Overall": upload_data["Overall"],
    }
    for subject in SUBJECTS:
        record[subject] = upload_data[subject]
    print("data_row:\n", record)

    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # DictReader skips blank lines and looks the model name up by column
    # name, so neither an empty row nor a reordered header breaks the check.
    with open(CSV_DIR, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        already_submitted = {row.get("Models") for row in reader}

    if record["Models"] in already_submitted:
        print('The entry already exists')
        return 'The entry already exists'

    # A completely empty file has no header yet; write one from the record.
    needs_header = not fieldnames
    if needs_header:
        fieldnames = list(record)

    with open(CSV_DIR, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(record)

    submission_repo.push_to_hub()
    print('Submission Successful')
    return 'Submission Successful'


def refresh_data():
    """Return a freshly loaded leaderboard table by delegating to ``get_df``."""
    refreshed = get_df()
    return refreshed