Spaces:

toksuite
/

quick-tokenizer-accuracy

Runtime error

App Files Files Community

Gül Sena Altıntaş committed on Sep 4, 2025

Commit

41156f1

1 Parent(s): 15729bc

UI updates

Browse files

Files changed (1) hide show

app.py +197 -54

app.py CHANGED Viewed

@@ -4,7 +4,9 @@ import io
 import logging
 import os
 import re
 from collections import Counter
 from typing import Any, Dict, List
 import gradio as gr
@@ -15,6 +17,12 @@ import psutil
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 def get_memory_usage():
     """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
@@ -47,21 +55,68 @@ logger = logging.getLogger(__name__)
 # Model configurations - maps display names to HF model paths
 PREDEFINED_MODELS = [
     "meta-llama/Llama-3.2-1B",
     "google/gemma-2-2b",
     "Qwen/Qwen3-0.6B",
     "Qwen/Qwen2.5-0.5B",
     "Qwen/Qwen2.5-1.5B",
     "bigscience/bloom-560m",
     "CohereForAI/aya-expanse-8b",
     "common-pile/comma-v0.1-2t",
     "google/byt5-small",
-    "gsaltintas/supertoken_models-llama_gpt2",
-    "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
-    "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
 ]
 # Global cache for loaded models
 model_cache = {}
 def normalize_delimiter(delim: str) -> str:
     delim = delim.strip()
@@ -124,18 +179,36 @@ def setup_tokenizer(model_path):
         from huggingface_hub import hf_hub_download, list_repo_files
         files = list_repo_files(model_path)
         if "tokenizer_config.json" in files:
             tokenizer_path = hf_hub_download(
                 repo_id=model_path, filename="tokenizer_config.json"
             )
             with open(tokenizer_path) as f:
                 tok_config = json.load(f)["data"]["tokenizer"]
-            if tok_config["name"] == "huggingface":
                 tokenizer_name = tok_config["path"]
-            # todo: tiktoken
-    tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_name, trust_remote_code=True, legacy=True
-    )
     return tokenizer
@@ -178,9 +251,9 @@ def load_model_and_tokenizer(model_path, progress_callback=None):
         # Load tokenizer
         tokenizer = setup_tokenizer(model_path)
-        # Add pad token if missing
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
         if progress_callback:
             progress_callback(
@@ -212,13 +285,16 @@ def load_model_and_tokenizer(model_path, progress_callback=None):
         error_msg = f"❌ Error loading model {model_path}: {str(e)}"
         logger.error(error_msg)
         # code.interact(local=dict(globals(), **locals()))
         if progress_callback:
             progress_callback(0.0, error_msg)
         return None
-def calculate_choice_likelihood(model, tokenizer, question, choice):
     """Calculate the log-likelihood of the choice given the question prompt"""
     try:
         prompt = f"Question: {question}\nAnswer: "
@@ -252,6 +328,19 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
         token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
         total_log_prob = token_log_probs.sum().item()
         return total_log_prob
     except Exception as e:
@@ -259,7 +348,12 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
         return float("-inf")
-def evaluate_model_on_questions(model_path, questions, progress_callback=None):
     """Evaluate a single model on all questions using likelihood-based scoring"""
     model_info = load_model_and_tokenizer(
@@ -281,7 +375,7 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
             for choice in question["choices"]:
                 likelihood = calculate_choice_likelihood(
-                    model, tokenizer, question["question"], choice
                 )
                 choice_likelihoods[choice] = likelihood
@@ -348,9 +442,12 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
 def run_evaluation(
     dataset_text,
     selected_predefined,
     custom_models_text="",
     delimiter: str = "\t",
     progress=gr.Progress(),
 ):
     """Main evaluation function"""
     if not dataset_text.strip():
@@ -380,6 +477,9 @@ def run_evaluation(
     # Add predefined models
     all_models.extend(selected_predefined)
     all_models.extend(custom_models)
     if not all_models:
@@ -436,7 +536,7 @@ def run_evaluation(
                 progress(overall_progress, msg)
             model_results = evaluate_model_on_questions(
-                model_path, questions, model_progress
             )
             results[display_name] = model_results
@@ -461,6 +561,10 @@ def run_evaluation(
         questions, results, summary_stats
     )
     csv_summary = generate_csv_summary(questions, results, summary_stats)
     return (
         summary_md,
@@ -823,8 +927,10 @@ def generate_compact_summary_markdown(questions, results, summary_stats):
     header = "| Q# | Question | Correct Answer |" + "".join(
         [f" {model} |" for model in model_names]
     )
-    separator = "|" + "|".join(
-        ["-" * (len(col.strip()) + 2) for col in header.split("|")[1:]]
     )
     lines.append(header)
@@ -983,14 +1089,15 @@ css = """
 with gr.Blocks(
     title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
 ) as demo:
-    with gr.Row():
-        with gr.Column(scale=2):
-            gr.Markdown("""
     # 🤖 Model Performance Comparison Tool
     Compare LLM performance on multiple-choice questions using Hugging Face models.
-    **Format**: Each line should have: `Question,Correct Answer,Choice1,Choice2,Choice3`
     💡 **Features**:
     - Model evaluation using HuggingFace transformers
@@ -998,18 +1105,16 @@ with gr.Blocks(
     - Detailed question-by-question results
     - Performance charts and statistics
     """)
-        with gr.Column(scale=1):
-            # with gr.Accordion("Delimiter Options"):
-            gr.Markdown("""
-            Enter the delimiter used in your dataset:
-            """)
-            delimiter_selector = gr.Textbox(
-                label="Delimiter",
-                placeholder="Enter a delimiter, e.g., , or \\t",
-                value="\\t",  # default
-                lines=1,
             )
-    with gr.Row():
         with gr.Column(scale=2):
             # Sample dataset selector
             sample_selector = gr.Dropdown(
@@ -1032,21 +1137,52 @@ What is the capital of France?,Paris,London,Berlin,Paris""",
                 max_lines=15,
                 elem_classes=["universal-text"],
             )
-            gr.Markdown("""
-            **Format Requirements**:
-            - Each data line: Question, Correct Answer, Choice1, Choice2, Choice3 (No header)
-            - Use commas or tabs as separators
-            """)
         with gr.Column(scale=1):
             # Model selection
             with gr.Tabs():
                 with gr.TabItem("🤖 Predefined Models"):
-                    predefined_selector = gr.CheckboxGroup(
-                        choices=PREDEFINED_MODELS,
-                        value=[PREDEFINED_MODELS[0]],
-                        label="Select from popular models",
                         interactive=True,
                     )
@@ -1068,16 +1204,14 @@ bigscience/bloom-560m""",
                     - `facebook/opt-350m`
                     - Your own fine-tuned models!
                     """)
             # Evaluate button
             evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)
-            gr.Markdown("""
-            **⚠️ Note**:
-            - Larger models require more GPU memory, currently we only run on CPU
-            - First run will download models (may take time)
-            - Models are cached for subsequent runs
-            """)
     # Results section
     with gr.Column(visible=True) as results_section:
@@ -1132,13 +1266,22 @@ bigscience/bloom-560m""",
         fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
     )
     evaluate_btn.click(
         fn=run_evaluation,
         inputs=[
             dataset_input,
-            predefined_selector,
             custom_models_input,
             delimiter_selector,
         ],
         outputs=[
             summary_output,
@@ -1174,4 +1317,4 @@ bigscience/bloom-560m""",
     """)
 if __name__ == "__main__":
-    demo.launch()

 import logging
 import os
 import re
+import traceback
 from collections import Counter
+from time import time
 from typing import Any, Dict, List
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from supertoken import (
+    build_tokenizer,
+)
+os.environ["HF_HUB_CACHE"] = f"/scratch/{os.environ.get('USER')}/.cache/huggingface/hub"
 def get_memory_usage():
     """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
 # Model configurations - maps display names to HF model paths
 PREDEFINED_MODELS = [
     "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    "meta-llama/Meta-Llama-3-8B",
+    "google/gemma-2-2b",
+    "google/gemma-2-2b-it",
+    "google/gemma-3-1b-pt",
+    "Qwen/Qwen3-0.6B",
+    "Qwen/Qwen2.5-0.5B",
+    "Qwen/Qwen2.5-1.5B",
+    # "Qwen/Qwen2.5-1.5B-Instruct",
+    "bigscience/bloom-560m",
+    "CohereForAI/aya-expanse-8b",
+    "common-pile/comma-v0.1-2t",
+    "google/byt5-small",
+    # "gsaltintas/supertoken_models-llama_gpt2",
+    # "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
+    # "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
+    "r-three/supertoken_models-llama_google-gemma-2-2b",
+    "r-three/supertoken_models-llama_common-pile-comma-v0.1",
+    "r-three/supertoken_models-llama_meta-llama-Llama-3.2-1B",
+    "r-three/supertoken_models-llama_microsoft-Phi-3-mini-4k-instruct",
+    "r-three/supertoken_models-llama_gpt2",
+    "r-three/supertoken_models-llama_bigscience-bloom",
+    "r-three/supertoken_models-llama_facebook-xglm-564M",
+]
+INDUSTRY_MODELS = [
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    "meta-llama/Meta-Llama-3-8B",
     "google/gemma-2-2b",
+    "google/gemma-2-2b-it",
+    "google/gemma-3-1b-pt",
     "Qwen/Qwen3-0.6B",
     "Qwen/Qwen2.5-0.5B",
     "Qwen/Qwen2.5-1.5B",
+    # "Qwen/Qwen2.5-1.5B-Instruct",
     "bigscience/bloom-560m",
     "CohereForAI/aya-expanse-8b",
     "common-pile/comma-v0.1-2t",
     "google/byt5-small",
+]
+TOKSUITE_MODELS = [
+    # "gsaltintas/supertoken_models-llama_gpt2",
+    # "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
+    # "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
+    "google-gemma-2-2b",
+    "common-pile-comma-v0.1",
+    "meta-llama-Llama-3.2-1B",
+    "microsoft-Phi-3-mini-4k-instruct",
+    "gpt2",
+    "bigscience-bloom",
+    "facebook-xglm-564M",
+    "mistralai-tekken",
+    "tokenmonster-englishcode-32000-consistent-v1",
+    "google-byt5-small",
 ]
 # Global cache for loaded models
 model_cache = {}
+print(os.environ.get("HF_HUB_CACHE"))
 def normalize_delimiter(delim: str) -> str:
     delim = delim.strip()
         from huggingface_hub import hf_hub_download, list_repo_files
         files = list_repo_files(model_path)
+        tokenizer = None
         if "tokenizer_config.json" in files:
             tokenizer_path = hf_hub_download(
                 repo_id=model_path, filename="tokenizer_config.json"
             )
             with open(tokenizer_path) as f:
                 tok_config = json.load(f)["data"]["tokenizer"]
                 tokenizer_name = tok_config["path"]
+                typ = tok_config["name"]
+            # tokenizer = build_tokenizer(typ, tokenizer_name).tokenizer
+            # if tok_config["name"] == "tiktoken":
+            #     tokenizer = TikTokenTokenizer.load(tokenizer_name)
+            #     logger.info(f"Using TikToken tokenizer for {tokenizer_name}")
+            # elif tok_config["name"] == "tokenmonster":
+            #     logger.info(f"Using TokenMonster tokenizer for {tokenizer_name}")
+            #     tokenizer = TokenMonsterTokenizer.load(tokenizer_name)
+            # elif tok_config["name"] == "tekken":
+            #     logger.info(f"Using Mistral tokenizer for {tokenizer_name}")
+            #     tokenizer = MistralTokenizer.load(tokenizer_name)
+            # else:
+            #     tokenizer = AutoTokenizer.from_pretrained(
+            #         tokenizer_name, trust_remote_code=True, legacy=True
+            #     )
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_name, trust_remote_code=True, legacy=True
+            )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name, trust_remote_code=True, legacy=True
+        )
     return tokenizer
         # Load tokenizer
         tokenizer = setup_tokenizer(model_path)
+        # # Add pad token if missing
+        # if tokenizer.pad_token is None:
+        #     tokenizer.pad_token = tokenizer.eos_token
         if progress_callback:
             progress_callback(
         error_msg = f"❌ Error loading model {model_path}: {str(e)}"
         logger.error(error_msg)
+        traceback.print_exc()
         # code.interact(local=dict(globals(), **locals()))
         if progress_callback:
             progress_callback(0.0, error_msg)
         return None
+def calculate_choice_likelihood(
+    model, tokenizer, question, choice, normalization_method: str = "token-length"
+):
     """Calculate the log-likelihood of the choice given the question prompt"""
     try:
         prompt = f"Question: {question}\nAnswer: "
         token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
         total_log_prob = token_log_probs.sum().item()
+        # char_len = answer_len
+        normalization_term = 1.0
+        if normalization_method == "token-length":
+            normalization_term = answer_len
+        elif normalization_method == "byte-length":
+            decoded_text = tokenizer.decode(target_ids[0]).strip()
+            byte_len = len(decoded_text.encode("utf-8"))
+            normalization_term = byte_len
+        # char_len = len(tokenizer.decode(target_ids[0]).strip())
+        # decoded_text = tokenizer.decode(target_ids[0]).strip()
+        # byte_len = len(decoded_text.encode("utf-8"))
+        # char_len = answer_len
+        total_log_prob /= normalization_term
         return total_log_prob
     except Exception as e:
         return float("-inf")
+def evaluate_model_on_questions(
+    model_path,
+    questions,
+    progress_callback=None,
+    normalization_method: str = "token-length",
+):
     """Evaluate a single model on all questions using likelihood-based scoring"""
     model_info = load_model_and_tokenizer(
             for choice in question["choices"]:
                 likelihood = calculate_choice_likelihood(
+                    model, tokenizer, question["question"], choice, normalization_method
                 )
                 choice_likelihoods[choice] = likelihood
 def run_evaluation(
     dataset_text,
     selected_predefined,
+    toksuite_selector,
     custom_models_text="",
     delimiter: str = "\t",
     progress=gr.Progress(),
+    save_summary=False,
+    normalization_method: str = "token-length",
 ):
     """Main evaluation function"""
     if not dataset_text.strip():
     # Add predefined models
     all_models.extend(selected_predefined)
+    all_models.extend(
+        [f"r-three/supertoken_models-llama_{model}" for model in toksuite_selector]
+    )
     all_models.extend(custom_models)
     if not all_models:
                 progress(overall_progress, msg)
             model_results = evaluate_model_on_questions(
+                model_path, questions, model_progress, normalization_method
             )
             results[display_name] = model_results
         questions, results, summary_stats
     )
     csv_summary = generate_csv_summary(questions, results, summary_stats)
+    slurm_id = os.environ.get("SLURM_JOB_ID", "")
+    if save_summary and slurm_id:
+        with open(f"{slurm_id}_summary_{time()}.md", "w") as f:
+            f.write(markdown_summary)
     return (
         summary_md,
     header = "| Q# | Question | Correct Answer |" + "".join(
         [f" {model} |" for model in model_names]
     )
+    separator = (
+        "|"
+        + "|".join(["-" * (len(col.strip()) + 1) for col in header.split("|")[1:-1]])
+        + "|"
     )
     lines.append(header)
 with gr.Blocks(
     title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
 ) as demo:
+    gr.Markdown("""
     # 🤖 Model Performance Comparison Tool
     Compare LLM performance on multiple-choice questions using Hugging Face models.
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("""
+    **Format**: Each line should have: `Question,Correct Answer,Choice1,Choice2,Choice3` (No header). Use commas or tabs as separators.
     💡 **Features**:
     - Model evaluation using HuggingFace transformers
     - Detailed question-by-question results
     - Performance charts and statistics
     """)
+            device_str = "a single GPU" if torch.cuda.is_available() else "CPU"
+            info_str = (
+                lambda: f"""
+            **⚠️ Note**:
+            - Larger models require more GPU memory, currently we only run on {device_str}
+            - First run will download models (may take time)
+            - Models are cached for subsequent runs. Currently loaded models: {list(model_cache.keys()) if model_cache else "None"}
+            """
             )
+            info_md = gr.Markdown(info_str())
         with gr.Column(scale=2):
             # Sample dataset selector
             sample_selector = gr.Dropdown(
                 max_lines=15,
                 elem_classes=["universal-text"],
             )
+    with gr.Row():
+        with gr.Column(scale=1):
+            normalization_method = gr.Radio(
+                label="Normalization Method",
+                choices=["token-length", "byte-length", "none"],
+                value="token-length",
+                info="Method to normalize log-likelihoods when scoring answers",
+            )
         with gr.Column(scale=1):
+            # with gr.Accordion("Delimiter Options"):
+            delimiter_selector = gr.Textbox(
+                info="Delimiter used in the dataset (e.g., comma or tab)",
+                label="Delimiter",
+                placeholder="Enter a delimiter, e.g., , or \\t",
+                value="\\t",  # default
+                lines=1,
+            )
+        with gr.Column(scale=1):
+            save_summary_checkbox = False
+            slurm_id = os.environ.get("SLURM_JOB_ID", "")
+            if slurm_id:
+                save_summary_checkbox = gr.Checkbox(
+                    info=f"To save the summary markdown file, check the box below. The filename will be prefixed with {slurm_id}:",
+                    label="Save summary markdown to file",
+                    value=False,
+                    # info="If checked, saves a markdown summary file with SLURM_JOB_ID prefix",
+                )
+    with gr.Row():
+        with gr.Column(scale=2):
+            toksuite_selector = gr.CheckboxGroup(
+                label="Select toksuite models",
+                choices=TOKSUITE_MODELS,
+                value=TOKSUITE_MODELS,
+                interactive=True,
+                info="These models share the same initialization and training source but differ only in their tokenizers. See [r-three/toksuite](https://huggingface.co/collections/r-three/toksuite-68ae7490c151341d78423295) for details.",
+            )
+        with gr.Column(scale=3):
             # Model selection
             with gr.Tabs():
                 with gr.TabItem("🤖 Predefined Models"):
+                    industry_selector = gr.CheckboxGroup(
+                        # choices=PREDEFINED_MODELS,
+                        choices=INDUSTRY_MODELS,
+                        # value=[x for x in PREDEFINED_MODELS if "r-three" in x],
+                        value=[],
+                        label="Select from industry models",
                         interactive=True,
                     )
                     - `facebook/opt-350m`
                     - Your own fine-tuned models!
                     """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            pass
+        with gr.Column(scale=2):
             # Evaluate button
             evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)
+        with gr.Column(scale=1):
+            pass
     # Results section
     with gr.Column(visible=True) as results_section:
         fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
     )
+    def update_info_md():
+        info_md.value = info_str()
+        return gr.update(value=info_str())
+    evaluate_btn.click(fn=update_info_md, inputs=None, outputs=info_md)
     evaluate_btn.click(
         fn=run_evaluation,
         inputs=[
             dataset_input,
+            # predefined_selector,
+            industry_selector,
+            toksuite_selector,
             custom_models_input,
             delimiter_selector,
+            save_summary_checkbox,
+            normalization_method,
         ],
         outputs=[
             summary_output,
     """)
 if __name__ == "__main__":
+    demo.launch(share=True)