Spaces:
Sleeping
Sleeping
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| import torch | |
class ModelLoader:
    """Load a causal-LM and its tokenizer from the Hugging Face Hub with
    4-bit (NF4) quantization and CPU-offload memory limits.

    The loaded objects are exposed as ``self.tokenizer`` and ``self.model``.
    Loading happens eagerly in ``__init__`` (network + disk I/O).
    """

    # Default per-device memory budget used when the caller does not supply one.
    # NOTE(review): these values target a ~4 GiB GPU + 12 GiB RAM host — adjust
    # via the `max_memory` parameter for other hardware.
    _DEFAULT_MAX_MEMORY = {
        "cpu": "12GiB",
        "cuda:0": "4GiB",
    }

    def __init__(self, model_name, hugging_face_token,
                 max_memory=None, device_map="auto"):
        """Download and initialize the tokenizer and quantized model.

        Args:
            model_name: Hub repo id (e.g. ``"meta-llama/Llama-2-7b-hf"``).
            hugging_face_token: Access token for gated/private repos.
            max_memory: Optional per-device memory budget mapping passed to
                ``from_pretrained``; defaults to the class-level budget.
            device_map: Device placement strategy; defaults to ``"auto"``.
        """
        self.model_name = model_name

        # 4-bit NF4 quantization; bfloat16 compute keeps matmul quality while
        # halving activation memory. fp32 CPU offload lets layers that don't
        # fit on the GPU spill to host RAM instead of failing.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_enable_fp32_cpu_offload=True,
        )

        # Tokenizer is small; load it first so a bad token/repo fails fast
        # before the expensive model download.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hugging_face_token,
        )

        # low_cpu_mem_usage streams weights instead of materializing a full
        # fp32 copy in RAM; max_memory caps what device_map="auto" may place
        # on each device.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.bnb_config,
            device_map=device_map,
            low_cpu_mem_usage=True,
            max_memory=dict(max_memory) if max_memory is not None
                       else dict(self._DEFAULT_MAX_MEMORY),
            token=hugging_face_token,
        )