import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results
from src.envs import RESULTS_REPO


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path, requests_path)
    # The first entry is treated as the path to the aggregated results file.
    with open(raw_data[0]) as fp:
        df = pd.DataFrame.from_records(json.load(fp))
    # Placeholder column for the confidence interval, filled in below.
    df['95% CI'] = " "
    df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=False)
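    # Format the 95% CI column as offsets from the score, e.g. "+1.2 / -0.9".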
    decimal = 1
    for i, row in df.iterrows():
        if 'lower' not in row:
            continue
        interval = (
            '+' + str(round(row['upper'] - row['score'], decimal))
            + ' / ' + str(round(row['lower'] - row['score'], decimal))
        )
        df.at[i, '95% CI'] = interval
    df = df[cols].round(decimals=2)

    # Optionally filter out rows where any of the benchmarks have not been produced.
    # df = df[has_no_nan_values(df, benchmark_cols)]
    return df


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".") and not entry.endswith(".jsonl")]
    all_evals = []

    for entry in entries:
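        # Each entry is either a single request .json file or a folder containing request files.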
        if ".json" in entry and 'toeval' not in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

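            # Build the display fields: clickable model link and revision (defaulting to "main").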
            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry and 'toeval' not in entry and 'results' not in entry:
            # this is a folder
            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
            for sub_entry in sub_entries:
                if 'toeval' in sub_entry:
                    continue
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

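    # Split the collected requests by status into the three queue views.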
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]