bobpopboom committed on
Commit 00c98e3 · verified · 1 Parent(s): 0b2dc4c

ik botyy fix it

Files changed (1)
  1. app.py +12 -41
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 if torch.cuda.is_available():
@@ -12,70 +12,41 @@ model_id = "thrishala/mental_health_chatbot"
 try:
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        device_map="cpu",
+        device_map=device,  # Use the determined device
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
-        max_memory={"cpu": "15GB"},
+        max_memory={device: "15GB"},  # Use device-specific memory management
         offload_folder="offload",
     )
-    model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.model_max_length = 512  # Set maximum length
-    # ok this is just to slow with pipe i wish it was faster. Si were ren=moving pipe in favor of local model
-
-    # pipe = pipeline(
-    #     "text-generation",
-    #     model=model,
-    #     tokenizer=tokenizer,
-    #     torch_dtype=torch.float16,
-    #     num_return_sequences=1,
-    #     do_sample=False,
-    #     truncation=True,
-    #     max_new_tokens=128
-    # )
 
 except Exception as e:
     print(f"Error loading model: {e}")
     exit()
+
 def generate_text(prompt, max_new_tokens=128):
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)  # Move input to the same device as the model
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
 
-    with torch.no_grad():  # Disable gradients during inference
+    with torch.no_grad():
         output = model.generate(
             input_ids=input_ids,
             max_new_tokens=max_new_tokens,
             do_sample=False,  # Or True for sampling
-            eos_token_id=tokenizer.eos_token_id,  # Use EOS token to stop generation
-        )[0]["generated_text"]
-
-        # Extract only the new assistant response after the last Assistant: in the prompt
-        bot_response = response[len(prompt):].split("User:")[0].strip()  # Take text after prompt and before next User
         )
 
     generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
     return generated_text
-def respond(
-    message,
-    history,
-    system_message,
-    max_tokens,
-):
-    # Construct the prompt with clear separation
+
+def respond(message, history, system_message, max_tokens):
     prompt = f"{system_message}\n"
     for user_msg, bot_msg in history:
         prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
     prompt += f"User: {message}\nAssistant:"
-
+
     try:
-        # response = pipe(
-        #     prompt,
-        #     max_new_tokens=max_tokens,
-        #     do_sample=False,
-        #     eos_token_id=tokenizer.eos_token_id,  # Use EOS token to stop generation
-        # )[0]["generated_text"]
-
-        # Extract only the new assistant response after the last Assistant: in the prompt
-        bot_response = generate_text(prompt, max_tokens)
+        bot_response = generate_text(prompt, max_tokens)  # Use the new function
         yield bot_response
     except Exception as e:
         print(f"Error during generation: {e}")
@@ -88,7 +59,7 @@ demo = gr.ChatInterface(
         value="You are a friendly and helpful mental health chatbot.",
         label="System message",
     ),
-    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+    gr.Slider(minimum=1, maximum=128, value=128, step=10, label="Max new tokens"),
     ],
 )
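For reference, here is a minimal sketch of how app.py fits together after this commit. The parts the diff does not show (the body of the device-selection check, the ChatInterface wiring around the Textbox/Slider fragment, and the launch call at the end) are assumptions reconstructed from context, not part of the commit; the loading, generation, and respond logic mirrors the added lines above.

# Sketch of app.py after this commit. Lines marked "assumed" are not in the
# diff and are reconstructed from context.
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Assumed: the diff only shows the `if torch.cuda.is_available():` check.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "thrishala/mental_health_chatbot"

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device,            # place the model on the chosen device
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        max_memory={device: "15GB"},  # as committed; note accelerate's docs key GPUs by index, e.g. {0: "15GB"}
        offload_folder="offload",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.model_max_length = 512
except Exception as e:
    print(f"Error loading model: {e}")
    exit()

def generate_text(prompt, max_new_tokens=128):
    # Tokenize on the model's device and decode the full greedy generation.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

def respond(message, history, system_message, max_tokens):
    # Flatten the chat history into a User:/Assistant: prompt and yield one reply.
    prompt = f"{system_message}\n"
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    try:
        yield generate_text(prompt, max_tokens)
    except Exception as e:
        print(f"Error during generation: {e}")

# Assumed wiring: the diff shows only the Textbox/Slider fragment of this call.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly and helpful mental health chatbot.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=128, value=128, step=10, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Note that generate_text decodes the entire generated sequence, so the yielded reply still contains the prompt text; trimming it back out (as the removed bot_response line attempted) would be a follow-up change.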