prithivMLmods committed
Commit d5fdad9 · verified · 1 Parent(s): f948054

Update app.py

Files changed (1):
  app.py +128 -109
app.py CHANGED
@@ -4,6 +4,7 @@ import uuid
 import json
 import time
 import asyncio
+import tempfile
 from threading import Thread
 
 import gradio as gr
@@ -12,6 +13,7 @@ import torch
 import numpy as np
 from PIL import Image
 import edge_tts
+import trimesh
 
 from transformers import (
     AutoModelForCausalLM,
@@ -21,14 +23,75 @@ from transformers import (
     AutoProcessor,
 )
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
-# Additional imports for 3D model generation
-import tempfile
-import trimesh
+from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# -----------------------------------------------------------------------------
+# Global constants and helper functions
+# -----------------------------------------------------------------------------
+
+MAX_SEED = np.iinfo(np.int32).max
+
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    return seed
+
+# -----------------------------------------------------------------------------
+# Model class for Text-to-3D Generation (ShapE)
+# -----------------------------------------------------------------------------
+
+class Model:
+    def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
+        self.pipe.to(self.device)
+
+        self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
+        self.pipe_img.to(self.device)
+
+    def to_glb(self, ply_path: str) -> str:
+        mesh = trimesh.load(ply_path)
+        rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
+        mesh.apply_transform(rot)
+        rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
+        mesh.apply_transform(rot)
+        mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
+        mesh.export(mesh_path.name, file_type="glb")
+        return mesh_path.name
+
+    def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        images = self.pipe(
+            prompt,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_steps,
+            output_type="mesh",
+        ).images
+        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
+        export_to_ply(images[0], ply_path.name)
+        return self.to_glb(ply_path.name)
+
+    def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        images = self.pipe_img(
+            image,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_steps,
+            output_type="mesh",
+        ).images
+        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
+        export_to_ply(images[0], ply_path.name)
+        return self.to_glb(ply_path.name)
+
+# -----------------------------------------------------------------------------
+# Gradio UI configuration
+# -----------------------------------------------------------------------------
+
 DESCRIPTION = """
 # QwQ Edge 💬
 """
@@ -50,13 +113,14 @@ h1 {
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-MAX_SEED = np.iinfo(np.int32).max
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# ------------------------------
-# Text Generation Model
-# ------------------------------
+# -----------------------------------------------------------------------------
+# Load Models and Pipelines for Chat, Image, and Multimodal Processing
+# -----------------------------------------------------------------------------
+
+# Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -66,11 +130,13 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
+# Voices for text-to-speech
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
 
+# Load multimodal processor and model (e.g. for OCR and image processing)
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -79,12 +145,20 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+# -----------------------------------------------------------------------------
+# Asynchronous text-to-speech
+# -----------------------------------------------------------------------------
+
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     return output_file
 
+# -----------------------------------------------------------------------------
+# Utility function to clean conversation history
+# -----------------------------------------------------------------------------
+
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
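Review note: `text_to_speech` is a thin async wrapper around `edge_tts.Communicate`. A quick way to exercise it on its own (assumptions: network access to the Edge TTS service, and that this file is importable as `app`):

```python
# Hedged sketch, not part of the commit.
import asyncio

from app import TTS_VOICES, text_to_speech  # hypothetical import of this file

mp3_path = asyncio.run(text_to_speech("What causes rainbows to form?", TTS_VOICES[1]))
print(mp3_path)  # -> "output.mp3"
```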
@@ -96,9 +170,10 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 
-# ------------------------------
-# Stable Diffusion XL (Image Generation)
-# ------------------------------
+# -----------------------------------------------------------------------------
+# Stable Diffusion XL Pipeline for Image Generation
+# -----------------------------------------------------------------------------
+
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -128,11 +203,6 @@ def save_image(img: Image.Image) -> str:
     img.save(unique_name)
     return unique_name
 
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-
 @spaces.GPU(duration=60, enable_queue=True)
 def generate_image_fn(
     prompt: str,
@@ -166,6 +236,7 @@ def generate_image_fn(
     options["use_resolution_binning"] = True
 
     images = []
+    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
@@ -180,82 +251,30 @@
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
-# ------------------------------
-# 3D Model Generation using ShapE (Text-to-3D / Image-to-3D)
-# ------------------------------
-class Model3D:
-    def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
-        self.pipe.to(self.device)
-        # Ensure the text encoder is in half precision
-        self.pipe.text_encoder = self.pipe.text_encoder.half()
-
-        self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
-        self.pipe_img.to(self.device)
-        # Ensure the text encoder is in half precision
-        self.pipe_img.text_encoder = self.pipe_img.text_encoder.half()
-
-    def to_glb(self, ply_path: str) -> str:
-        mesh = trimesh.load(ply_path)
-        rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
-        mesh.apply_transform(rot)
-        rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
-        mesh.apply_transform(rot)
-        mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
-        mesh.export(mesh_path.name, file_type="glb")
-        return mesh_path.name
-
-    def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        if self.device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                output = self.pipe(
-                    prompt,
-                    generator=generator,
-                    guidance_scale=guidance_scale,
-                    num_inference_steps=num_steps,
-                    output_type="mesh",
-                )
-        else:
-            output = self.pipe(
-                prompt,
-                generator=generator,
-                guidance_scale=guidance_scale,
-                num_inference_steps=num_steps,
-                output_type="mesh",
-            )
-        images = output.images
-        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
-        export_to_ply(images[0], ply_path.name)
-        return self.to_glb(ply_path.name)
-
-    def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        if self.device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                output = self.pipe_img(
-                    image,
-                    generator=generator,
-                    guidance_scale=guidance_scale,
-                    num_inference_steps=num_steps,
-                    output_type="mesh",
-                )
-        else:
-            output = self.pipe_img(
-                image,
-                generator=generator,
-                guidance_scale=guidance_scale,
-                num_inference_steps=num_steps,
-                output_type="mesh",
-            )
-        images = output.images
-        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
-        export_to_ply(images[0], ply_path.name)
-        return self.to_glb(ply_path.name)
-
-# Create a global instance of the 3D model generator.
-model_3d = Model3D()
+# -----------------------------------------------------------------------------
+# Text-to-3D Generation using the ShapE Pipeline
+# -----------------------------------------------------------------------------
+
+@spaces.GPU(duration=120, enable_queue=True)
+def generate_3d_fn(
+    prompt: str,
+    seed: int = 1,
+    guidance_scale: float = 15.0,
+    num_steps: int = 64,
+    randomize_seed: bool = False,
+):
+    """
+    Generate a 3D model from text using the ShapE pipeline.
+    Returns a tuple of (glb_file_path, used_seed).
+    """
+    seed = int(randomize_seed_fn(seed, randomize_seed))
+    model3d = Model()
+    glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
+    return glb_path, seed
+
+# -----------------------------------------------------------------------------
+# Chat Generation Function with support for @tts, @image, and @3d commands
+# -----------------------------------------------------------------------------
 
 @spaces.GPU
 def generate(
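Review note: `generate_3d_fn` is now the single entry point for the `@3d` command, and it instantiates a fresh `Model` per call. Calling it directly, outside Gradio, would look like the sketch below; with `randomize_seed=True` the returned seed records the value actually drawn by `randomize_seed_fn`:

```python
# Hedged sketch, not part of the commit.
glb_path, used_seed = generate_3d_fn(
    prompt="A futuristic city skyline in the style of cyberpunk",
    seed=1,
    guidance_scale=15.0,
    num_steps=64,
    randomize_seed=True,
)
print(glb_path, used_seed)  # e.g. /tmp/tmpXXXXXXXX.glb 1234567
```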
@@ -269,7 +288,7 @@ def generate(
 ):
     """
     Generates chatbot responses with support for multimodal input, TTS, image generation,
-    and 3D model generation.
+    and now 3D generation.
 
     Special commands:
     - "@tts1" or "@tts2": triggers text-to-speech.
@@ -279,24 +298,21 @@
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
-    # ------------------------------
-    # 3D Model Generation Command
-    # ------------------------------
+    # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
-        text = text[len("@3d"):].strip()
+        prompt = text[len("@3d"):].strip()
         yield "Generating 3D model..."
-        seed = random.randint(0, MAX_SEED)
-        if files:
-            image = load_image(files[0])
-            glb_file = model_3d.run_image(image, seed=seed)
-        else:
-            glb_file = model_3d.run_text(text, seed=seed)
-        yield gr.File(glb_file)
+        glb_path, used_seed = generate_3d_fn(
+            prompt=prompt,
+            seed=1,
+            guidance_scale=15.0,
+            num_steps=64,
+            randomize_seed=True,
+        )
+        yield gr.File(glb_path, label="3D Model (GLB)")
         return
 
-    # ------------------------------
-    # Image Generation Command
-    # ------------------------------
+    # --- Image Generation branch ---
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
@@ -316,9 +332,7 @@ def generate(
         yield gr.Image(image_paths[0])
         return
 
-    # ------------------------------
-    # TTS / Regular Text Generation
-    # ------------------------------
+    # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -394,6 +408,10 @@ def generate(
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)
 
+# -----------------------------------------------------------------------------
+# Gradio Chat Interface Setup and Launch
+# -----------------------------------------------------------------------------
+
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -408,9 +426,9 @@ demo = gr.ChatInterface(
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
+        ["@3d A futuristic city skyline in the style of cyberpunk"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
-        ["@3d A futuristic spaceship in low-poly style"],
     ],
     cache_examples=False,
     type="messages",
@@ -423,4 +441,5 @@
 )
 
 if __name__ == "__main__":
+    # To create a public link, set share=True in launch().
     demo.queue(max_size=20).launch(share=True)