canerkonuk committed
Commit 233c6c0 · verified · 1 Parent(s): b367958

Create app.py

Files changed (1)
  1. app.py +194 -0

app.py ADDED

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model configuration
MODEL_ID = "microsoft/bitnet-b1.58-2B-4T"

# Initialize model and tokenizer
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print("Model loaded successfully!")

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """
    Generate a response using the BitNet model.

    Args:
        message: User's current message
        history: List of previous (user, assistant) message pairs
        system_prompt: System instruction for the model
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
    """
    # Build the conversation, starting with the system instruction
    messages = [{"role": "system", "content": system_prompt}]

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current message
    messages.append({"role": "user", "content": message})

    # Apply the model's chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (not the prompt)
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    )

    return response
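
# Quick sanity check from a Python REPL (hypothetical values, not part of the app):
#   reply = chat("What is 1.58-bit quantization?", [],
#                "You are a helpful AI assistant.", 256, 0.7, 0.9)
#   print(reply)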

# Create Gradio interface
with gr.Blocks(title="BitNet b1.58 2B Chat Demo") as demo:
    gr.Markdown("""
    # 🚀 BitNet b1.58 2B-4T Chat Demo

    This is a demo of Microsoft's BitNet b1.58 2B model - a 1.58-bit Large Language Model trained on 4 trillion tokens.

    **Key Features:**
    - 1.58-bit weights (ternary: {-1, 0, 1})
    - Significantly reduced memory footprint
    - Faster inference and lower energy consumption
    - Performance comparable to full-precision LLMs of similar size

    ⚠️ **Note:** This model is for research purposes. Responses may be unexpected, biased, or inaccurate.
    """)

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Conversation",
                height=500,
                show_copy_button=True
            )

            msg = gr.Textbox(
                label="Your Message",
                placeholder="Type your message here...",
                lines=2
            )

            with gr.Row():
                submit = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")

        with gr.Column(scale=1):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant.",
                lines=3
            )

            max_tokens = gr.Slider(
                minimum=50,
                maximum=512,
                value=256,
                step=1,
                label="Max Tokens"
            )

            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )

            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )

            gr.Markdown("""
            ### Parameters Guide:
            - **Max Tokens:** Maximum length of the response
            - **Temperature:** Higher = more creative, lower = more focused
            - **Top P:** Nucleus sampling threshold
            """)

    # Event handlers
    def user_message(user_input, history):
        # Clear the textbox and append the user turn with a pending response
        return "", history + [[user_input, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        user_input = history[-1][0]
        bot_message = chat(
            user_input,
            history[:-1],
            system_prompt,
            max_tokens,
            temperature,
            top_p
        )
        history[-1][1] = bot_message
        return history

    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )

    submit.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )

    # Reset the conversation
    clear.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown("""
    ---
    **Resources:**
    - [Model Card](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T)
    - [Technical Paper](https://huggingface.co/papers/2504.12285)
    - [BitNet GitHub](https://github.com/microsoft/BitNet)
    """)

# Launch the app
if __name__ == "__main__":
    demo.queue()
    demo.launch()
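
To run this Space locally, install the dependencies and start the script. A minimal sketch, assuming a requirements.txt along these lines (the file itself is not part of this commit; accelerate is required by device_map="auto"):

    gradio
    torch
    transformers
    accelerate

Then:

    python app.py

Gradio serves the UI on http://127.0.0.1:7860 by default.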