suvadityamuk committed
Commit 272fd5b
1 Parent(s): bd59709

chore: made optims

Signed-off-by: Suvaditya Mukherjee <[email protected]>

Files changed (2)
  1. app.py +39 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,8 +1,11 @@
 import os
 import re
 import json
+import time
+import wandb
 import torch
 import spaces
+import psutil
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
@@ -42,9 +45,16 @@ def generate_answer(chat_history):
         add_generation_prompt=True,
     )
     tool_prompt = tool_prompt.to(model.device)
-    out = model.generate(**tool_prompt, max_new_tokens=512)
+    out = model.generate(
+        **tool_prompt,
+        max_new_tokens=512,
+        do_sample=True,
+        top_p=0.95,
+        num_beams=4
+    )
     generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
     generated_text = tokenizer.decode(generated_text)
+    torch.cuda.empty_cache()
     return generated_text
 
 def parse_tool_request(tool_call, top_k=5):
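For context on the hunk above: passing do_sample=True and top_p together with num_beams=4 makes transformers run beam-search multinomial sampling instead of plain greedy decoding, and the prompt tokens are sliced off before decoding. A minimal standalone sketch of the same decoding settings, assuming any Hugging Face causal LM; the small model id and the prompt below are illustrative only (the Space itself loads Qwen/Qwen2.5-3B-Instruct):

    # Sketch: the same decoding settings on a small causal LM (model id is illustrative).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    lm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", torch_dtype=torch.bfloat16)

    inputs = tok("What does the resume say about Python experience?", return_tensors="pt").to(lm.device)
    out = lm.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,   # sample tokens instead of taking the argmax
        top_p=0.95,       # nucleus sampling over the top 95% of probability mass
        num_beams=4,      # combined with do_sample: beam-search multinomial sampling
    )
    # Keep only the newly generated tokens, mirroring the slicing in generate_answer()
    print(tok.decode(out[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))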
@@ -97,7 +107,7 @@ if __name__ == "__main__":
     fulltext = merge_strings_with_prefix(fulltext)
 
     # Embed the sentences
-    client = QdrantClient(":memory:")
+    client = QdrantClient(":memory:", optimize_for_ram_usage=True)
 
     client.set_model("sentence-transformers/all-MiniLM-L6-v2")
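The retrieval that backs the tool call runs through qdrant-client's fastembed integration (set_model plus add/query) on an in-memory store. A short sketch of that flow, assuming the same sentence-transformers model; the collection name and documents below are placeholders, not the Space's real data:

    # Sketch: in-memory Qdrant retrieval via the fastembed integration (placeholder data).
    from qdrant_client import QdrantClient

    client = QdrantClient(":memory:")
    client.set_model("sentence-transformers/all-MiniLM-L6-v2")

    client.add(
        collection_name="resume",
        documents=["Built a RAG demo with Qdrant.", "Five years of PyTorch on GPUs."],
    )

    hits = client.query(collection_name="resume", query_text="GPU experience", limit=5)
    for hit in hits:
        print(hit.score, hit.document)  # top-k chunks handed back to the LLM as tool output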
 
@@ -115,7 +125,7 @@ if __name__ == "__main__":
         parallel=0,
     )
 
-    # FOR QWEN, THIS IS WORKING
+    wandb.init(project="resume-rag", name="zerogpu-run")
 
     model_name = "Qwen/Qwen2.5-3B-Instruct"
@@ -128,6 +138,7 @@ if __name__ == "__main__":
         }
         chat_history.append(current_message)
 
+        start_time = time.time()
         # Generate LLM answer
        generated_text = generate_answer(chat_history)
@@ -137,7 +148,6 @@ if __name__ == "__main__":
 
         # If tool call was requested
         if query_results is not None and tool_query is not None:
-            print("Inside")
             # Update chat history with result of tool call
             chat_history = update_chat_history(
                 chat_history, tool_query, query_results
@@ -145,13 +155,37 @@ if __name__ == "__main__":
             # Generate result from the
             generated_text = generate_answer(chat_history)
 
+        metrics = {
+            "conversation": {
+                "turn": len(chat_history) // 2,
+                "history": chat_history,
+                "current_question": message,
+                "current_answer": generated_text[:-10],
+                "tool_query": tool_query,
+                "rag_results": query_results
+            },
+            "performance": {
+                "response_time": time.time() - start_time,
+                "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
+                "cpu_memory": psutil.Process().memory_info().rss,
+                "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0
+            }
+        }
+        wandb.log(metrics)
+
+        wandb.finish()
+
         return generated_text[:-10]
 
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         torch_dtype=torch.bfloat16,
         device_map="auto",
-        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4"
+        )
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
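The new per-turn logging combines wandb with psutil and the torch.cuda counters. A self-contained sketch of that pattern, assuming wandb credentials are already configured; the project, run, and metric names here are illustrative:

    # Sketch: per-turn latency and memory logging with wandb, psutil and torch.cuda.
    # Assumes WANDB_API_KEY (or wandb.login()) is set up; names are illustrative.
    import time
    import psutil
    import torch
    import wandb

    wandb.init(project="resume-rag-example", name="example-run")

    start_time = time.time()
    # ... run generation for one chat turn here ...
    wandb.log({
        "performance": {
            "response_time": time.time() - start_time,
            "cpu_memory": psutil.Process().memory_info().rss,  # resident set size, in bytes
            "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
            "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0,
        }
    })
    wandb.finish()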
 
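The model is now loaded with 4-bit NF4 weights through bitsandbytes instead of 8-bit. A minimal load sketch with the same quantization settings, assuming a CUDA device and the bitsandbytes and accelerate packages are installed:

    # Sketch: loading the chat model with 4-bit NF4 quantization (requires a CUDA GPU).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # store weights in 4-bit
        bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
        bnb_4bit_compute_dtype=torch.float16,  # dtype used for matmuls at runtime
    )

    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-3B-Instruct",
        device_map="auto",
        quantization_config=bnb_config,
    )
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")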
 
requirements.txt CHANGED
@@ -9,4 +9,6 @@ torchvision
 torchaudio
 accelerate
 bitsandbytes
-optimum
+optimum
+wandb
+psutil