Priyanka6 committed
Commit 6114841 · 1 Parent(s): 7d51297

Update space

Files changed (1)
  1. app.py +148 -33
app.py CHANGED
@@ -1,24 +1,141 @@
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Load the model and tokenizer only once at startup
-MODEL_NAME = "sarvamai/sarvam-1"
-tokenizer = None
-model = None
-
-def load_model():
-    global tokenizer, model
-    if tokenizer is None or model is None:
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
-        model.eval()
-
-def respond(message, history, max_tokens, temperature, top_p):
-    global tokenizer, model
-    # Ensure model is loaded
-    load_model()
-
     # Convert chat history to format
     messages = [{"role": "system", "content": "You are a friendly AI assistant."}]
     for val in history:
@@ -31,11 +148,8 @@ def respond(message, history, max_tokens, temperature, top_p):
     # Tokenize and generate response
     inputs = tokenizer.apply_chat_template(messages, tokenize=False)
     input_tokens = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Get the length of input tokens to separate new response
-    input_length = input_tokens.input_ids.shape[1]
 
-    output_tokens = model.generate(
         **input_tokens,
         max_new_tokens=max_tokens,
         temperature=temperature,
@@ -43,29 +157,30 @@ def respond(message, history, max_tokens, temperature, top_p):
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
-
-    # Extract only the new tokens (the model's response)
-    new_tokens = output_tokens[0][input_length:]
-    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-
-    # Clean up any remaining system prompt or formatting artifacts
-    response = response.strip()
-    if response.startswith("assistant:"):
-        response = response[len("assistant:"):].strip()
-
     return response
 
 # Define Gradio Chat Interface
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens"),
         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
-    title="Sarvam-1 Chat Interface",
-    description="Chat with the Sarvam-1 language model"
 )
 
 if __name__ == "__main__":
-    demo.launch()
+# # import gradio as gr
+# # from huggingface_hub import InferenceClient
+
+# # """
+# # For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+# # """
+# # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+
+
+# # def respond(
+# #     message,
+# #     history: list[tuple[str, str]],
+# #     system_message,
+# #     max_tokens,
+# #     temperature,
+# #     top_p,
+# # ):
+# #     messages = [{"role": "system", "content": system_message}]
+
+# #     for val in history:
+# #         if val[0]:
+# #             messages.append({"role": "user", "content": val[0]})
+# #         if val[1]:
+# #             messages.append({"role": "assistant", "content": val[1]})
+
+# #     messages.append({"role": "user", "content": message})
+
+# #     response = ""
+
+# #     for message in client.chat_completion(
+# #         messages,
+# #         max_tokens=max_tokens,
+# #         stream=True,
+# #         temperature=temperature,
+# #         top_p=top_p,
+# #     ):
+# #         token = message.choices[0].delta.content
+
+# #         response += token
+# #         yield response
+
+
+# # """
+# # For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+# # """
+# # demo = gr.ChatInterface(
+# #     respond,
+# #     additional_inputs=[
+# #         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+# #         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+# #         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+# #         gr.Slider(
+# #             minimum=0.1,
+# #             maximum=1.0,
+# #             value=0.95,
+# #             step=0.05,
+# #             label="Top-p (nucleus sampling)",
+# #         ),
+# #     ],
+# # )
+
+
+# # if __name__ == "__main__":
+# #     demo.launch()
+
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import os
+from safetensors.torch import load_file, save_file
+
+# Define model names
+# MODEL_1_PATH = "./adapter_model.safetensors"  # Local path inside Space
+###
+MODEL_1_PATH = "Priyanka6/fine-tuning-inference"
+###
+MODEL_2_NAME = "sarvamai/sarvam-1"  # The base model on Hugging Face Hub
+# MODEL_3_NAME =
+
+def trim_adapter_weights(model_path):
+    """
+    Trims the last token from the adapter's lm_head.lora_B.default.weight
+    if there is a mismatch with the base model.
+    """
+    model_path = "./adapter_model.safetensors"
+    # if not os.path.exists(model_path):
+    #     raise FileNotFoundError(f"Adapter file not found: {model_path}")
+
+    checkpoint = load_file(model_path)
+    print("Keys in checkpoint:", list(checkpoint.keys()))
+
+    key_to_trim = "lm_head.lora_B.default.weight"
+
+    if key_to_trim in checkpoint:
+        print("Entered")
+        original_size = checkpoint[key_to_trim].shape[0]
+        expected_size = original_size - 1  # Removing last token
+
+        print(f"Trimming {key_to_trim}: {original_size} -> {expected_size}")
+
+        checkpoint[key_to_trim] = checkpoint[key_to_trim][:-1]  # Trim the last row
+
+        # Save the modified adapter next to the original file
+        trimmed_adapter_path = os.path.join(os.path.dirname(model_path), "adapter_model_trimmed.safetensors")
+        save_file(checkpoint, trimmed_adapter_path)
+        return trimmed_adapter_path
+    print("did not execute the if block")
+    return model_path
+model_path = os.path.join(MODEL_1_PATH, "adapter_model.safetensors")
+trimmed_adapter_path = trim_adapter_weights(model_path)
+
+# Load the tokenizer (same for both models)
+TOKENIZER_NAME = "sarvamai/sarvam-1"
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
+
+# Function to load a model
+def load_model(model_choice):
+    if model_choice == "Hugging face dataset":
+        model = AutoModelForCausalLM.from_pretrained("./", torch_dtype=torch.float16, device_map="auto")
+        trimmed_adapter_path = os.path.join("Priyanka6/fine-tuning-inference", "adapter_model_trimmed.safetensors")
+        model.load_adapter(trimmed_adapter_path, "safe_tensors")  # Load safetensors adapter
+    else:
+        model = AutoModelForCausalLM.from_pretrained(MODEL_2_NAME)
+    model.eval()
+    return model
+
+# Load default model on startup
+current_model = load_model("Hugging face dataset")
+
+# Chatbot response function
+def respond(message, history, model_choice, max_tokens, temperature, top_p):
+    global current_model
+
+    # Switch model if user selects a different one
+    if (model_choice == "Hugging face dataset" and current_model is not None and current_model.config.name_or_path != MODEL_1_PATH) or \
+       (model_choice == "Proprietary dataset1" and current_model is not None and current_model.config.name_or_path != MODEL_2_NAME):
+        current_model = load_model(model_choice)
+
     # Convert chat history to format
     messages = [{"role": "system", "content": "You are a friendly AI assistant."}]
     for val in history:
 
     # Tokenize and generate response
     inputs = tokenizer.apply_chat_template(messages, tokenize=False)
     input_tokens = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
 
+    output_tokens = current_model.generate(
         **input_tokens,
         max_new_tokens=max_tokens,
         temperature=temperature,

         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
+
+    response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
     return response
 
 # Define Gradio Chat Interface
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
+        gr.Dropdown(choices=["Hugging face dataset", "Proprietary dataset1"], value="Hugging face dataset", label="Select Model"),
         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens"),
         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
 )
 
 if __name__ == "__main__":
+    demo.launch()
+
+
+# # Test the chatbot
+# if __name__ == "__main__":
+#     while True:
+#         query = input("User: ")
+#         if query.lower() in ["exit", "quit"]:
+#             break
+#         response = chat(query)
+#         print(f"Bot: {response}")
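A note on decoding: the updated respond() decodes output_tokens[0] in full, so the returned text includes the system prompt and user message along with the model's reply. The previous version avoided this by slicing off the prompt tokens before decoding. A minimal sketch of restoring that step on top of the new code (it assumes tokenizer, current_model, input_tokens, and the sampling parameters are already set up exactly as in app.py above):

    # Record how many tokens the prompt occupies before generating
    input_length = input_tokens.input_ids.shape[1]

    output_tokens = current_model.generate(
        **input_tokens,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = output_tokens[0][input_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()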