limitedonly41 committed
Commit b2aa395 · verified · 1 Parent(s): 0343456

Update app.py

Files changed (1):
  1. app.py +13 -10
app.py CHANGED
@@ -10,8 +10,6 @@ from unsloth import FastLanguageModel
 import torch
 import re
 
-
-
 # Define helper functions
 async def fetch_data(url):
     headers = {
@@ -102,14 +100,13 @@ def translate_text(text):
         print(f"An error occurred during translation: {e}")
         return None
 
-@spaces.GPU()
-def summarize_url(url):
-
-    # Load the model
+def load_model():
     max_seq_length = 2048
     dtype = None
     load_in_4bit = True
-
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
         max_seq_length=max_seq_length,
@@ -117,9 +114,15 @@ def summarize_url(url):
         load_in_4bit=load_in_4bit,
     )
 
-    # Enable native 2x faster inference
-    FastLanguageModel.for_inference(model)
+    # Enable native 2x faster inference if GPU is available
+    if device == "cuda":
+        FastLanguageModel.for_inference(model)
 
+    return model, tokenizer, device
+
+model, tokenizer, device = load_model()
+
+def summarize_url(url):
     result = asyncio.run(fetch_data(url))
     text = concatenate_text(result)
     translated_text = translate_text(text)
@@ -136,7 +139,7 @@ def summarize_url(url):
     """
 
     prompt = alpaca_prompt.format(translated_text)
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
     outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
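The net effect of the commit: model loading moves out of the request path into a load_model() helper that detects the device and runs once at import time (the @spaces.GPU() decorator is dropped along the way), so every summarize_url() call reuses the same model, tokenizer, and device. A minimal sketch of that pattern, assembled only from calls visible in the diff (illustrative, not a verbatim copy of app.py):

import torch
from unsloth import FastLanguageModel

def load_model():
    # Fall back to CPU when no GPU is present; the old code assumed "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        max_seq_length=2048,  # value used in the diff
        dtype=None,           # let the library pick a dtype
        load_in_4bit=True,    # 4-bit quantized weights
    )
    if device == "cuda":
        # Unsloth's native 2x faster inference mode, GPU only.
        FastLanguageModel.for_inference(model)
    return model, tokenizer, device

# Runs once per process; request handlers reuse the cached objects.
model, tokenizer, device = load_model()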
 
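With loading hoisted to module level, the handler itself only tokenizes, generates, and decodes, sending tensors to whichever device was detected rather than hard-coding .to("cuda") as before. A short usage sketch; the prompt string here is hypothetical:

prompt = "Summarize the following text:\n..."
inputs = tokenizer(prompt, return_tensors="pt").to(device)  # device-aware
outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)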