Spaces:
Runtime error
Runtime error
Gül Sena Altıntaş commited on
Commit ·
41156f1
1
Parent(s): 15729bc
UI updates
Browse files
app.py
CHANGED
|
@@ -4,7 +4,9 @@ import io
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import re
|
|
|
|
| 7 |
from collections import Counter
|
|
|
|
| 8 |
from typing import Any, Dict, List
|
| 9 |
|
| 10 |
import gradio as gr
|
|
@@ -15,6 +17,12 @@ import psutil
|
|
| 15 |
import torch
|
| 16 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def get_memory_usage():
|
| 20 |
"""Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
|
|
@@ -47,21 +55,68 @@ logger = logging.getLogger(__name__)
|
|
| 47 |
# Model configurations - maps display names to HF model paths
|
| 48 |
PREDEFINED_MODELS = [
|
| 49 |
"meta-llama/Llama-3.2-1B",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
"google/gemma-2-2b",
|
|
|
|
|
|
|
| 51 |
"Qwen/Qwen3-0.6B",
|
| 52 |
"Qwen/Qwen2.5-0.5B",
|
| 53 |
"Qwen/Qwen2.5-1.5B",
|
|
|
|
| 54 |
"bigscience/bloom-560m",
|
| 55 |
"CohereForAI/aya-expanse-8b",
|
| 56 |
"common-pile/comma-v0.1-2t",
|
| 57 |
"google/byt5-small",
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"gsaltintas/supertoken_models-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
]
|
| 62 |
# Global cache for loaded models
|
| 63 |
model_cache = {}
|
| 64 |
|
|
|
|
|
|
|
| 65 |
|
| 66 |
def normalize_delimiter(delim: str) -> str:
|
| 67 |
delim = delim.strip()
|
|
@@ -124,18 +179,36 @@ def setup_tokenizer(model_path):
|
|
| 124 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 125 |
|
| 126 |
files = list_repo_files(model_path)
|
|
|
|
| 127 |
if "tokenizer_config.json" in files:
|
| 128 |
tokenizer_path = hf_hub_download(
|
| 129 |
repo_id=model_path, filename="tokenizer_config.json"
|
| 130 |
)
|
| 131 |
with open(tokenizer_path) as f:
|
| 132 |
tok_config = json.load(f)["data"]["tokenizer"]
|
| 133 |
-
if tok_config["name"] == "huggingface":
|
| 134 |
tokenizer_name = tok_config["path"]
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return tokenizer
|
| 140 |
|
| 141 |
|
|
@@ -178,9 +251,9 @@ def load_model_and_tokenizer(model_path, progress_callback=None):
|
|
| 178 |
# Load tokenizer
|
| 179 |
tokenizer = setup_tokenizer(model_path)
|
| 180 |
|
| 181 |
-
# Add pad token if missing
|
| 182 |
-
if tokenizer.pad_token is None:
|
| 183 |
-
|
| 184 |
|
| 185 |
if progress_callback:
|
| 186 |
progress_callback(
|
|
@@ -212,13 +285,16 @@ def load_model_and_tokenizer(model_path, progress_callback=None):
|
|
| 212 |
|
| 213 |
error_msg = f"❌ Error loading model {model_path}: {str(e)}"
|
| 214 |
logger.error(error_msg)
|
|
|
|
| 215 |
# code.interact(local=dict(globals(), **locals()))
|
| 216 |
if progress_callback:
|
| 217 |
progress_callback(0.0, error_msg)
|
| 218 |
return None
|
| 219 |
|
| 220 |
|
| 221 |
-
def calculate_choice_likelihood(
|
|
|
|
|
|
|
| 222 |
"""Calculate the log-likelihood of the choice given the question prompt"""
|
| 223 |
try:
|
| 224 |
prompt = f"Question: {question}\nAnswer: "
|
|
@@ -252,6 +328,19 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
|
|
| 252 |
token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
|
| 253 |
|
| 254 |
total_log_prob = token_log_probs.sum().item()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
return total_log_prob
|
| 256 |
|
| 257 |
except Exception as e:
|
|
@@ -259,7 +348,12 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
|
|
| 259 |
return float("-inf")
|
| 260 |
|
| 261 |
|
| 262 |
-
def evaluate_model_on_questions(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
"""Evaluate a single model on all questions using likelihood-based scoring"""
|
| 264 |
|
| 265 |
model_info = load_model_and_tokenizer(
|
|
@@ -281,7 +375,7 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
|
|
| 281 |
|
| 282 |
for choice in question["choices"]:
|
| 283 |
likelihood = calculate_choice_likelihood(
|
| 284 |
-
model, tokenizer, question["question"], choice
|
| 285 |
)
|
| 286 |
choice_likelihoods[choice] = likelihood
|
| 287 |
|
|
@@ -348,9 +442,12 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
|
|
| 348 |
def run_evaluation(
|
| 349 |
dataset_text,
|
| 350 |
selected_predefined,
|
|
|
|
| 351 |
custom_models_text="",
|
| 352 |
delimiter: str = "\t",
|
| 353 |
progress=gr.Progress(),
|
|
|
|
|
|
|
| 354 |
):
|
| 355 |
"""Main evaluation function"""
|
| 356 |
if not dataset_text.strip():
|
|
@@ -380,6 +477,9 @@ def run_evaluation(
|
|
| 380 |
|
| 381 |
# Add predefined models
|
| 382 |
all_models.extend(selected_predefined)
|
|
|
|
|
|
|
|
|
|
| 383 |
all_models.extend(custom_models)
|
| 384 |
|
| 385 |
if not all_models:
|
|
@@ -436,7 +536,7 @@ def run_evaluation(
|
|
| 436 |
progress(overall_progress, msg)
|
| 437 |
|
| 438 |
model_results = evaluate_model_on_questions(
|
| 439 |
-
model_path, questions, model_progress
|
| 440 |
)
|
| 441 |
results[display_name] = model_results
|
| 442 |
|
|
@@ -461,6 +561,10 @@ def run_evaluation(
|
|
| 461 |
questions, results, summary_stats
|
| 462 |
)
|
| 463 |
csv_summary = generate_csv_summary(questions, results, summary_stats)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
|
| 465 |
return (
|
| 466 |
summary_md,
|
|
@@ -823,8 +927,10 @@ def generate_compact_summary_markdown(questions, results, summary_stats):
|
|
| 823 |
header = "| Q# | Question | Correct Answer |" + "".join(
|
| 824 |
[f" {model} |" for model in model_names]
|
| 825 |
)
|
| 826 |
-
separator =
|
| 827 |
-
|
|
|
|
|
|
|
| 828 |
)
|
| 829 |
|
| 830 |
lines.append(header)
|
|
@@ -983,14 +1089,15 @@ css = """
|
|
| 983 |
with gr.Blocks(
|
| 984 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
| 985 |
) as demo:
|
| 986 |
-
|
| 987 |
-
with gr.Column(scale=2):
|
| 988 |
-
gr.Markdown("""
|
| 989 |
# 🤖 Model Performance Comparison Tool
|
| 990 |
|
| 991 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
| 992 |
-
|
| 993 |
-
|
|
|
|
|
|
|
|
|
|
| 994 |
|
| 995 |
💡 **Features**:
|
| 996 |
- Model evaluation using HuggingFace transformers
|
|
@@ -998,18 +1105,16 @@ with gr.Blocks(
|
|
| 998 |
- Detailed question-by-question results
|
| 999 |
- Performance charts and statistics
|
| 1000 |
""")
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
value="\\t", # default
|
| 1010 |
-
lines=1,
|
| 1011 |
)
|
| 1012 |
-
|
| 1013 |
with gr.Column(scale=2):
|
| 1014 |
# Sample dataset selector
|
| 1015 |
sample_selector = gr.Dropdown(
|
|
@@ -1032,21 +1137,52 @@ What is the capital of France?,Paris,London,Berlin,Paris""",
|
|
| 1032 |
max_lines=15,
|
| 1033 |
elem_classes=["universal-text"],
|
| 1034 |
)
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
|
|
|
| 1042 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1043 |
# Model selection
|
| 1044 |
with gr.Tabs():
|
| 1045 |
with gr.TabItem("🤖 Predefined Models"):
|
| 1046 |
-
|
| 1047 |
-
choices=PREDEFINED_MODELS,
|
| 1048 |
-
|
| 1049 |
-
|
|
|
|
|
|
|
| 1050 |
interactive=True,
|
| 1051 |
)
|
| 1052 |
|
|
@@ -1068,16 +1204,14 @@ bigscience/bloom-560m""",
|
|
| 1068 |
- `facebook/opt-350m`
|
| 1069 |
- Your own fine-tuned models!
|
| 1070 |
""")
|
| 1071 |
-
|
|
|
|
|
|
|
|
|
|
| 1072 |
# Evaluate button
|
| 1073 |
evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
**⚠️ Note**:
|
| 1077 |
-
- Larger models require more GPU memory, currently we only run on CPU
|
| 1078 |
-
- First run will download models (may take time)
|
| 1079 |
-
- Models are cached for subsequent runs
|
| 1080 |
-
""")
|
| 1081 |
|
| 1082 |
# Results section
|
| 1083 |
with gr.Column(visible=True) as results_section:
|
|
@@ -1132,13 +1266,22 @@ bigscience/bloom-560m""",
|
|
| 1132 |
fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
|
| 1133 |
)
|
| 1134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1135 |
evaluate_btn.click(
|
| 1136 |
fn=run_evaluation,
|
| 1137 |
inputs=[
|
| 1138 |
dataset_input,
|
| 1139 |
-
predefined_selector,
|
|
|
|
|
|
|
| 1140 |
custom_models_input,
|
| 1141 |
delimiter_selector,
|
|
|
|
|
|
|
| 1142 |
],
|
| 1143 |
outputs=[
|
| 1144 |
summary_output,
|
|
@@ -1174,4 +1317,4 @@ bigscience/bloom-560m""",
|
|
| 1174 |
""")
|
| 1175 |
|
| 1176 |
if __name__ == "__main__":
|
| 1177 |
-
demo.launch()
|
|
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
+
import traceback
|
| 8 |
from collections import Counter
|
| 9 |
+
from time import time
|
| 10 |
from typing import Any, Dict, List
|
| 11 |
|
| 12 |
import gradio as gr
|
|
|
|
| 17 |
import torch
|
| 18 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
|
| 20 |
+
from supertoken import (
|
| 21 |
+
build_tokenizer,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
os.environ["HF_HUB_CACHE"] = f"/scratch/{os.environ.get('USER')}/.cache/huggingface/hub"
|
| 25 |
+
|
| 26 |
|
| 27 |
def get_memory_usage():
|
| 28 |
"""Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
|
|
|
|
| 55 |
# Model configurations - maps display names to HF model paths
|
| 56 |
PREDEFINED_MODELS = [
|
| 57 |
"meta-llama/Llama-3.2-1B",
|
| 58 |
+
"meta-llama/Llama-3.2-1B-Instruct",
|
| 59 |
+
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 60 |
+
"meta-llama/Meta-Llama-3-8B",
|
| 61 |
+
"google/gemma-2-2b",
|
| 62 |
+
"google/gemma-2-2b-it",
|
| 63 |
+
"google/gemma-3-1b-pt",
|
| 64 |
+
"Qwen/Qwen3-0.6B",
|
| 65 |
+
"Qwen/Qwen2.5-0.5B",
|
| 66 |
+
"Qwen/Qwen2.5-1.5B",
|
| 67 |
+
# "Qwen/Qwen2.5-1.5B-Instruct",
|
| 68 |
+
"bigscience/bloom-560m",
|
| 69 |
+
"CohereForAI/aya-expanse-8b",
|
| 70 |
+
"common-pile/comma-v0.1-2t",
|
| 71 |
+
"google/byt5-small",
|
| 72 |
+
# "gsaltintas/supertoken_models-llama_gpt2",
|
| 73 |
+
# "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
| 74 |
+
# "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
|
| 75 |
+
"r-three/supertoken_models-llama_google-gemma-2-2b",
|
| 76 |
+
"r-three/supertoken_models-llama_common-pile-comma-v0.1",
|
| 77 |
+
"r-three/supertoken_models-llama_meta-llama-Llama-3.2-1B",
|
| 78 |
+
"r-three/supertoken_models-llama_microsoft-Phi-3-mini-4k-instruct",
|
| 79 |
+
"r-three/supertoken_models-llama_gpt2",
|
| 80 |
+
"r-three/supertoken_models-llama_bigscience-bloom",
|
| 81 |
+
"r-three/supertoken_models-llama_facebook-xglm-564M",
|
| 82 |
+
]
|
| 83 |
+
INDUSTRY_MODELS = [
|
| 84 |
+
"meta-llama/Llama-3.2-1B",
|
| 85 |
+
"meta-llama/Llama-3.2-1B-Instruct",
|
| 86 |
+
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 87 |
+
"meta-llama/Meta-Llama-3-8B",
|
| 88 |
"google/gemma-2-2b",
|
| 89 |
+
"google/gemma-2-2b-it",
|
| 90 |
+
"google/gemma-3-1b-pt",
|
| 91 |
"Qwen/Qwen3-0.6B",
|
| 92 |
"Qwen/Qwen2.5-0.5B",
|
| 93 |
"Qwen/Qwen2.5-1.5B",
|
| 94 |
+
# "Qwen/Qwen2.5-1.5B-Instruct",
|
| 95 |
"bigscience/bloom-560m",
|
| 96 |
"CohereForAI/aya-expanse-8b",
|
| 97 |
"common-pile/comma-v0.1-2t",
|
| 98 |
"google/byt5-small",
|
| 99 |
+
]
|
| 100 |
+
TOKSUITE_MODELS = [
|
| 101 |
+
# "gsaltintas/supertoken_models-llama_gpt2",
|
| 102 |
+
# "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
| 103 |
+
# "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
|
| 104 |
+
"google-gemma-2-2b",
|
| 105 |
+
"common-pile-comma-v0.1",
|
| 106 |
+
"meta-llama-Llama-3.2-1B",
|
| 107 |
+
"microsoft-Phi-3-mini-4k-instruct",
|
| 108 |
+
"gpt2",
|
| 109 |
+
"bigscience-bloom",
|
| 110 |
+
"facebook-xglm-564M",
|
| 111 |
+
"mistralai-tekken",
|
| 112 |
+
"tokenmonster-englishcode-32000-consistent-v1",
|
| 113 |
+
"google-byt5-small",
|
| 114 |
]
|
| 115 |
# Global cache for loaded models
|
| 116 |
model_cache = {}
|
| 117 |
|
| 118 |
+
print(os.environ.get("HF_HUB_CACHE"))
|
| 119 |
+
|
| 120 |
|
| 121 |
def normalize_delimiter(delim: str) -> str:
|
| 122 |
delim = delim.strip()
|
|
|
|
| 179 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 180 |
|
| 181 |
files = list_repo_files(model_path)
|
| 182 |
+
tokenizer = None
|
| 183 |
if "tokenizer_config.json" in files:
|
| 184 |
tokenizer_path = hf_hub_download(
|
| 185 |
repo_id=model_path, filename="tokenizer_config.json"
|
| 186 |
)
|
| 187 |
with open(tokenizer_path) as f:
|
| 188 |
tok_config = json.load(f)["data"]["tokenizer"]
|
|
|
|
| 189 |
tokenizer_name = tok_config["path"]
|
| 190 |
+
typ = tok_config["name"]
|
| 191 |
+
# tokenizer = build_tokenizer(typ, tokenizer_name).tokenizer
|
| 192 |
+
# if tok_config["name"] == "tiktoken":
|
| 193 |
+
# tokenizer = TikTokenTokenizer.load(tokenizer_name)
|
| 194 |
+
# logger.info(f"Using TikToken tokenizer for {tokenizer_name}")
|
| 195 |
+
# elif tok_config["name"] == "tokenmonster":
|
| 196 |
+
# logger.info(f"Using TokenMonster tokenizer for {tokenizer_name}")
|
| 197 |
+
# tokenizer = TokenMonsterTokenizer.load(tokenizer_name)
|
| 198 |
+
# elif tok_config["name"] == "tekken":
|
| 199 |
+
# logger.info(f"Using Mistral tokenizer for {tokenizer_name}")
|
| 200 |
+
# tokenizer = MistralTokenizer.load(tokenizer_name)
|
| 201 |
+
# else:
|
| 202 |
+
# tokenizer = AutoTokenizer.from_pretrained(
|
| 203 |
+
# tokenizer_name, trust_remote_code=True, legacy=True
|
| 204 |
+
# )
|
| 205 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 206 |
+
tokenizer_name, trust_remote_code=True, legacy=True
|
| 207 |
+
)
|
| 208 |
+
else:
|
| 209 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 210 |
+
tokenizer_name, trust_remote_code=True, legacy=True
|
| 211 |
+
)
|
| 212 |
return tokenizer
|
| 213 |
|
| 214 |
|
|
|
|
| 251 |
# Load tokenizer
|
| 252 |
tokenizer = setup_tokenizer(model_path)
|
| 253 |
|
| 254 |
+
# # Add pad token if missing
|
| 255 |
+
# if tokenizer.pad_token is None:
|
| 256 |
+
# tokenizer.pad_token = tokenizer.eos_token
|
| 257 |
|
| 258 |
if progress_callback:
|
| 259 |
progress_callback(
|
|
|
|
| 285 |
|
| 286 |
error_msg = f"❌ Error loading model {model_path}: {str(e)}"
|
| 287 |
logger.error(error_msg)
|
| 288 |
+
traceback.print_exc()
|
| 289 |
# code.interact(local=dict(globals(), **locals()))
|
| 290 |
if progress_callback:
|
| 291 |
progress_callback(0.0, error_msg)
|
| 292 |
return None
|
| 293 |
|
| 294 |
|
| 295 |
+
def calculate_choice_likelihood(
|
| 296 |
+
model, tokenizer, question, choice, normalization_method: str = "token-length"
|
| 297 |
+
):
|
| 298 |
"""Calculate the log-likelihood of the choice given the question prompt"""
|
| 299 |
try:
|
| 300 |
prompt = f"Question: {question}\nAnswer: "
|
|
|
|
| 328 |
token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
|
| 329 |
|
| 330 |
total_log_prob = token_log_probs.sum().item()
|
| 331 |
+
# char_len = answer_len
|
| 332 |
+
normalization_term = 1.0
|
| 333 |
+
if normalization_method == "token-length":
|
| 334 |
+
normalization_term = answer_len
|
| 335 |
+
elif normalization_method == "byte-length":
|
| 336 |
+
decoded_text = tokenizer.decode(target_ids[0]).strip()
|
| 337 |
+
byte_len = len(decoded_text.encode("utf-8"))
|
| 338 |
+
normalization_term = byte_len
|
| 339 |
+
# char_len = len(tokenizer.decode(target_ids[0]).strip())
|
| 340 |
+
# decoded_text = tokenizer.decode(target_ids[0]).strip()
|
| 341 |
+
# byte_len = len(decoded_text.encode("utf-8"))
|
| 342 |
+
# char_len = answer_len
|
| 343 |
+
total_log_prob /= normalization_term
|
| 344 |
return total_log_prob
|
| 345 |
|
| 346 |
except Exception as e:
|
|
|
|
| 348 |
return float("-inf")
|
| 349 |
|
| 350 |
|
| 351 |
+
def evaluate_model_on_questions(
|
| 352 |
+
model_path,
|
| 353 |
+
questions,
|
| 354 |
+
progress_callback=None,
|
| 355 |
+
normalization_method: str = "token-length",
|
| 356 |
+
):
|
| 357 |
"""Evaluate a single model on all questions using likelihood-based scoring"""
|
| 358 |
|
| 359 |
model_info = load_model_and_tokenizer(
|
|
|
|
| 375 |
|
| 376 |
for choice in question["choices"]:
|
| 377 |
likelihood = calculate_choice_likelihood(
|
| 378 |
+
model, tokenizer, question["question"], choice, normalization_method
|
| 379 |
)
|
| 380 |
choice_likelihoods[choice] = likelihood
|
| 381 |
|
|
|
|
| 442 |
def run_evaluation(
|
| 443 |
dataset_text,
|
| 444 |
selected_predefined,
|
| 445 |
+
toksuite_selector,
|
| 446 |
custom_models_text="",
|
| 447 |
delimiter: str = "\t",
|
| 448 |
progress=gr.Progress(),
|
| 449 |
+
save_summary=False,
|
| 450 |
+
normalization_method: str = "token-length",
|
| 451 |
):
|
| 452 |
"""Main evaluation function"""
|
| 453 |
if not dataset_text.strip():
|
|
|
|
| 477 |
|
| 478 |
# Add predefined models
|
| 479 |
all_models.extend(selected_predefined)
|
| 480 |
+
all_models.extend(
|
| 481 |
+
[f"r-three/supertoken_models-llama_{model}" for model in toksuite_selector]
|
| 482 |
+
)
|
| 483 |
all_models.extend(custom_models)
|
| 484 |
|
| 485 |
if not all_models:
|
|
|
|
| 536 |
progress(overall_progress, msg)
|
| 537 |
|
| 538 |
model_results = evaluate_model_on_questions(
|
| 539 |
+
model_path, questions, model_progress, normalization_method
|
| 540 |
)
|
| 541 |
results[display_name] = model_results
|
| 542 |
|
|
|
|
| 561 |
questions, results, summary_stats
|
| 562 |
)
|
| 563 |
csv_summary = generate_csv_summary(questions, results, summary_stats)
|
| 564 |
+
slurm_id = os.environ.get("SLURM_JOB_ID", "")
|
| 565 |
+
if save_summary and slurm_id:
|
| 566 |
+
with open(f"{slurm_id}_summary_{time()}.md", "w") as f:
|
| 567 |
+
f.write(markdown_summary)
|
| 568 |
|
| 569 |
return (
|
| 570 |
summary_md,
|
|
|
|
| 927 |
header = "| Q# | Question | Correct Answer |" + "".join(
|
| 928 |
[f" {model} |" for model in model_names]
|
| 929 |
)
|
| 930 |
+
separator = (
|
| 931 |
+
"|"
|
| 932 |
+
+ "|".join(["-" * (len(col.strip()) + 1) for col in header.split("|")[1:-1]])
|
| 933 |
+
+ "|"
|
| 934 |
)
|
| 935 |
|
| 936 |
lines.append(header)
|
|
|
|
| 1089 |
with gr.Blocks(
|
| 1090 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
| 1091 |
) as demo:
|
| 1092 |
+
gr.Markdown("""
|
|
|
|
|
|
|
| 1093 |
# 🤖 Model Performance Comparison Tool
|
| 1094 |
|
| 1095 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
| 1096 |
+
""")
|
| 1097 |
+
with gr.Row():
|
| 1098 |
+
with gr.Column(scale=1):
|
| 1099 |
+
gr.Markdown("""
|
| 1100 |
+
**Format**: Each line should have: `Question,Correct Answer,Choice1,Choice2,Choice3` (No header). Use commas or tabs as separators.
|
| 1101 |
|
| 1102 |
💡 **Features**:
|
| 1103 |
- Model evaluation using HuggingFace transformers
|
|
|
|
| 1105 |
- Detailed question-by-question results
|
| 1106 |
- Performance charts and statistics
|
| 1107 |
""")
|
| 1108 |
+
device_str = "a single GPU" if torch.cuda.is_available() else "CPU"
|
| 1109 |
+
info_str = (
|
| 1110 |
+
lambda: f"""
|
| 1111 |
+
**⚠️ Note**:
|
| 1112 |
+
- Larger models require more GPU memory, currently we only run on {device_str}
|
| 1113 |
+
- First run will download models (may take time)
|
| 1114 |
+
- Models are cached for subsequent runs. Currently loaded models: {list(model_cache.keys()) if model_cache else "None"}
|
| 1115 |
+
"""
|
|
|
|
|
|
|
| 1116 |
)
|
| 1117 |
+
info_md = gr.Markdown(info_str())
|
| 1118 |
with gr.Column(scale=2):
|
| 1119 |
# Sample dataset selector
|
| 1120 |
sample_selector = gr.Dropdown(
|
|
|
|
| 1137 |
max_lines=15,
|
| 1138 |
elem_classes=["universal-text"],
|
| 1139 |
)
|
| 1140 |
+
with gr.Row():
|
| 1141 |
+
with gr.Column(scale=1):
|
| 1142 |
+
normalization_method = gr.Radio(
|
| 1143 |
+
label="Normalization Method",
|
| 1144 |
+
choices=["token-length", "byte-length", "none"],
|
| 1145 |
+
value="token-length",
|
| 1146 |
+
info="Method to normalize log-likelihoods when scoring answers",
|
| 1147 |
+
)
|
| 1148 |
with gr.Column(scale=1):
|
| 1149 |
+
# with gr.Accordion("Delimiter Options"):
|
| 1150 |
+
delimiter_selector = gr.Textbox(
|
| 1151 |
+
info="Delimiter used in the dataset (e.g., comma or tab)",
|
| 1152 |
+
label="Delimiter",
|
| 1153 |
+
placeholder="Enter a delimiter, e.g., , or \\t",
|
| 1154 |
+
value="\\t", # default
|
| 1155 |
+
lines=1,
|
| 1156 |
+
)
|
| 1157 |
+
with gr.Column(scale=1):
|
| 1158 |
+
save_summary_checkbox = False
|
| 1159 |
+
slurm_id = os.environ.get("SLURM_JOB_ID", "")
|
| 1160 |
+
if slurm_id:
|
| 1161 |
+
save_summary_checkbox = gr.Checkbox(
|
| 1162 |
+
info=f"To save the summary markdown file, check the box below. The filename will be prefixed with {slurm_id}:",
|
| 1163 |
+
label="Save summary markdown to file",
|
| 1164 |
+
value=False,
|
| 1165 |
+
# info="If checked, saves a markdown summary file with SLURM_JOB_ID prefix",
|
| 1166 |
+
)
|
| 1167 |
+
with gr.Row():
|
| 1168 |
+
with gr.Column(scale=2):
|
| 1169 |
+
toksuite_selector = gr.CheckboxGroup(
|
| 1170 |
+
label="Select toksuite models",
|
| 1171 |
+
choices=TOKSUITE_MODELS,
|
| 1172 |
+
value=TOKSUITE_MODELS,
|
| 1173 |
+
interactive=True,
|
| 1174 |
+
info="These models share the same initialization and training source but differ only in their tokenizers. See [r-three/toksuite](https://huggingface.co/collections/r-three/toksuite-68ae7490c151341d78423295) for details.",
|
| 1175 |
+
)
|
| 1176 |
+
with gr.Column(scale=3):
|
| 1177 |
# Model selection
|
| 1178 |
with gr.Tabs():
|
| 1179 |
with gr.TabItem("🤖 Predefined Models"):
|
| 1180 |
+
industry_selector = gr.CheckboxGroup(
|
| 1181 |
+
# choices=PREDEFINED_MODELS,
|
| 1182 |
+
choices=INDUSTRY_MODELS,
|
| 1183 |
+
# value=[x for x in PREDEFINED_MODELS if "r-three" in x],
|
| 1184 |
+
value=[],
|
| 1185 |
+
label="Select from industry models",
|
| 1186 |
interactive=True,
|
| 1187 |
)
|
| 1188 |
|
|
|
|
| 1204 |
- `facebook/opt-350m`
|
| 1205 |
- Your own fine-tuned models!
|
| 1206 |
""")
|
| 1207 |
+
with gr.Row():
|
| 1208 |
+
with gr.Column(scale=1):
|
| 1209 |
+
pass
|
| 1210 |
+
with gr.Column(scale=2):
|
| 1211 |
# Evaluate button
|
| 1212 |
evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)
|
| 1213 |
+
with gr.Column(scale=1):
|
| 1214 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1215 |
|
| 1216 |
# Results section
|
| 1217 |
with gr.Column(visible=True) as results_section:
|
|
|
|
| 1266 |
fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
|
| 1267 |
)
|
| 1268 |
|
| 1269 |
+
def update_info_md():
|
| 1270 |
+
info_md.value = info_str()
|
| 1271 |
+
return gr.update(value=info_str())
|
| 1272 |
+
|
| 1273 |
+
evaluate_btn.click(fn=update_info_md, inputs=None, outputs=info_md)
|
| 1274 |
evaluate_btn.click(
|
| 1275 |
fn=run_evaluation,
|
| 1276 |
inputs=[
|
| 1277 |
dataset_input,
|
| 1278 |
+
# predefined_selector,
|
| 1279 |
+
industry_selector,
|
| 1280 |
+
toksuite_selector,
|
| 1281 |
custom_models_input,
|
| 1282 |
delimiter_selector,
|
| 1283 |
+
save_summary_checkbox,
|
| 1284 |
+
normalization_method,
|
| 1285 |
],
|
| 1286 |
outputs=[
|
| 1287 |
summary_output,
|
|
|
|
| 1317 |
""")
|
| 1318 |
|
| 1319 |
if __name__ == "__main__":
|
| 1320 |
+
demo.launch(share=True)
|