import os
import re
import json
import time
import requests
import wandb
import torch
import spaces
from tqdm.auto import tqdm
import psutil
import pymupdf
import gradio as gr
from qdrant_client import QdrantClient
from utils import download_pdf_from_gdrive, merge_strings_with_prefix
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

def rag_query(query: str):
    """
    Performs semantic search against the vector database that
    holds information about Suvaditya, indexed from his resume.

    Args:
        query: The query against which the search will be run,
            in the form of a single string phrase of no more
            than 10 words.

    Returns:
        search_results: A list of the results that are semantically
            closest to the given query, ranked by Cosine Similarity.
    """
    return client.query(
        collection_name="resume",
        query_text=query
    )
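
# Illustrative usage, assuming the "resume" collection has been populated
# further below (query text is a made-up example):
#   hits = rag_query("machine learning internship experience")
#   print(hits[0].document)  # closest-matching resume chunk
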
# The `spaces` import and the Space title indicate a ZeroGPU deployment, so the
# GPU-bound generation step is decorated accordingly (assumed configuration).
@spaces.GPU
def generate_answer(chat_history):
    # Render the chat history, plus the tool schema derived from rag_query's
    # signature and docstring, into model-ready tensors
    tool_prompt = tokenizer.apply_chat_template(
        chat_history,
        tools=[rag_query],
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    tool_prompt = tool_prompt.to(model.device)

    out = model.generate(
        **tool_prompt,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.95,
        num_beams=4
    )

    # Decode only the newly generated tokens, then release cached GPU memory
    generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(generated_text)
    torch.cuda.empty_cache()

    return generated_text

def parse_tool_request(tool_call, top_k=5):
    # Pull the JSON payload out of the model's <tool_call>...</tool_call> block
    pattern = r"<tool_call>(.*?)</tool_call>"
    match_result = re.search(pattern, tool_call, re.DOTALL)

    if match_result:
        result = match_result.group(1).strip()
    else:
        return None, None

    query = json.loads(result)["arguments"]["query"]
    query_results = [
        query_piece.metadata["document"] for query_piece in rag_query(query)
    ]

    return query_results[:top_k], query
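
# Qwen2.5's chat template emits tool calls as JSON wrapped in <tool_call> tags,
# e.g. (illustrative):
#   <tool_call>
#   {"name": "rag_query", "arguments": {"query": "current employer"}}
#   </tool_call>
# parse_tool_request() extracts that JSON, runs the retrieval, and returns the
# top_k matching documents along with the query itself.
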
def update_chat_history(chat_history, tool_query, query_results):
    # Record the assistant's tool call, then the tool's response, so the model
    # can ground its next answer in the retrieved resume chunks
    assistant_tool_message = {
        "role": "assistant",
        "metadata": "🛠️ Using Qdrant Engine to search for the query 🛠️",
        "tool_calls": [{
            "type": "function",
            "function": {
                "name": "rag_query",
                "arguments": {"query": f"{tool_query}"}
            }
        }]
    }
    result_tool_message = {
        "role": "tool",
        "name": "rag_query",
        "content": "\n".join(query_results)
    }

    chat_history.append(assistant_tool_message)
    chat_history.append(result_tool_message)

    return chat_history
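
# After update_chat_history(), the tail of the history looks like (illustrative):
#   {"role": "assistant", "tool_calls": [{"type": "function", "function": {...}}]}
#   {"role": "tool", "name": "rag_query", "content": "<retrieved chunks>"}
# which matches the message schema apply_chat_template expects for tool use.
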
if __name__ == "__main__":

    # Blank lines between sections matter: the resume is chunked for retrieval
    # by splitting on "\n\n" below.
    RESUME_DATA = """
Suvaditya Mukherjee Email: [email protected]
Portfolio: suvadityamuk.com Mobile: (213) 827-9733
Github: github.com/suvadityamuk

Education

University of Southern California Master of Science - Computer Science (Artificial Intelligence); GPA: 3.85/4 - Los Angeles, CA, USA
August 2024 - July 2026
Courses: Machine Learning, Deep Learning, Advanced Computer Vision, Analysis of Algorithms

NMIMS Mukesh Patel School of Technology, Management and Engineering
Bachelor of Technology - Computer Science (Artificial Intelligence); GPA: 3.94/4 - Mumbai, India
August 2020 - May 2024
Courses: Deep Learning, Data Structures and Algorithms, Machine Learning, Natural Language Processing, Software Engineering,
Operating Systems, Mathematics, Computer Organization and Architecture, Computer Networks, Database Management Systems

Experience

USC Institute of Creative Technologies Los Angeles, CA, USA
Machine Learning Student Worker - Learning Sciences Lab (Part-time) September 2024 - Present
Course Generation using Generative AI: Leverage Generative AI with LangChain and OpenAI to help build novel
techniques for course generation, tutoring content generation, and OpenTutor courses to learn and teach AI for the
AIRCOEE program in collaboration with the US Department of Defense, under Prof. (Dr.) Benjamin Nye.
Cogeneration Testbed: Maintain technologies for co-generation of tutoring content using open and cloud-based LLMs
to help educators.

USC School of Cinematic Arts Los Angeles, CA, USA
Machine Learning Assistant - Interactive Games Division (Part-time) September 2024 - Present
Student Worker: Assist Prof. (Dr.) Mark Bolas in developing an introductory Python Programming course for Game
Developers.
ML Research: Find new approaches to apply Generative AI based on LLMs and Diffusion Models to solve problems at
large scale in Creative Media, with solutions such as generating scripts and summaries based on videos.

HARMAN International Bengaluru, India
Machine Learning Intern (Full-time) December 2023 - May 2024
K-Shot Rotation-Invariant Object Detection Pipeline Development: Produced new Intellectual Property
towards achieving a robust pipeline to perform K-shot object detection without dependence on rotation alignment.
Improved the pipeline with 35% better results on client data.
Zero-shot Time-Series Forecasting with LLMs: Researched how to achieve zero-shot time-series forecasting
through LLMs while building on previous developments.
Spot Instance Handler using Agentic LLMs: Built an agent-based LLM system on Gemini 1.5 Pro and LangChain
to help reduce incurred costs by 10% by running non-critical workloads on spot instances.

Center for Visual Information Technology, IIIT-Hyderabad Hyderabad, India
Research Intern (Full-time) June 2023 - November 2023
Research: Contributed towards research on Domain Adaptation problems in Autonomous Driving under Prof. C.V.
Jawahar and Prof. Shankar Gangisetty.
Code Implementations: Operated internal tools to execute large-scale GPU training and experimentation on
Image Segmentation problems.

UnifyAI (Ivy) London, United Kingdom
ML Research Engineer Intern (Full-time) January 2023 - July 2023
Demos and Examples: Developed new demos, examples, and guides for internal and external official documentation,
most notably around converting torchvision models into TFLite. Also helped establish programs and manage the
Google Summer of Code program as an Organization Admin.
Internal AI Developer: Prototyped an AI Developer (Code-LLM) that builds upon existing codebases and speeds up
internal development, along with handling self-training through Cloud resources such as GCP and AWS.

Publications and Research

Presentation: Pushing the Performance Envelope: An Optimization Study for 3D Generative Modelling with
PyTorch: Work on finding techniques to optimize 3D Text-to-Image Mesh generation [Accepted at PyTorch Conference 2024]
Paper: Guiding the Student's Learning Curve: Augmenting Knowledge Distillation with Insights from
GradCAM: Work on investigating the effects of using GradCAM representations of Teacher models as direct inputs to
Student models for quicker convergence. [Accepted]
Paper: Project Lingua Franca: Democratizing Information through Unified Optical Character Recognition
and Neural Machine Translation: Work on combining Optical Character Recognition and Neural Machine Translation for
information translation with high-impact languages as targets. [Accepted]

Leadership

Google Developer Expert: Recognized and selected as a top contributor to the Google ML Developer Community. Work
towards creating detailed tutorials, delivering talks around Deep Learning, and helping beta-test new products in the GCP Vertex
AI and Gemini suite of tools.
Google Summer of Code: (Org Admin and Mentor) Mentored incoming students in completing tasks and handled
communications with the Google Open Source Programs Office for compliance.
"""
    # RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
    # RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
    # ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
    # SAVE_PATH = "./model.onnx_data"

    # print("Downloading ONNX model...")
    # response = requests.get(ONNX_MODEL_PATH, stream=True)
    # response.raise_for_status()
    # total_size = int(response.headers.get('content-length', 0))

    # with open(SAVE_PATH, 'wb') as file, tqdm(
    #     desc=os.path.basename(SAVE_PATH),
    #     total=total_size,
    #     unit='iB',
    #     unit_scale=True
    # ) as pbar:
    #     for data in response.iter_content(chunk_size=8192):
    #         size = file.write(data)
    #         pbar.update(size)

    # print("Downloaded ONNX model!")

    # Download the resume PDF and extract its text
    # download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
    # doc = pymupdf.open(RESUME_PATH)
    # fulltext = doc[0].get_text().split("\n")
    # fulltext = merge_strings_with_prefix(fulltext)
    fulltext = RESUME_DATA.split("\n\n")
    print(fulltext)
    # Embed the resume chunks
    # client = QdrantClient(":memory:", optimize_for_ram_usage=True)
    client = QdrantClient(":memory:")
    client.set_model("sentence-transformers/all-MiniLM-L6-v2")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if not client.collection_exists(collection_name="resume"):
        client.create_collection(
            collection_name="resume",
            vectors_config=client.get_fastembed_vector_params(),
        )

    _ = client.add(
        collection_name="resume",
        documents=fulltext,
        ids=range(len(fulltext)),
        batch_size=100,
        # parallel=0,
    )
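
    # Quick sanity check (illustrative): confirm retrieval works before wiring
    # up the LLM. Uncomment to inspect the top matches for a sample query.
    # for hit in client.query(collection_name="resume", query_text="education", limit=2):
    #     print(hit.score, hit.document)
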
    # wandb.login(
    #     key=os.getenv("WANDB_API_KEY")
    # )

    model_name = "Qwen/Qwen2.5-3B-Instruct"

    # wandb.init(
    #     project="resume-rag",
    #     name="zerogpu-run",
    #     save_code=True,
    #     config={
    #         "model_name": model_name,
    #         "resume_url": RESUME_URL
    #     }
    # )
    def rag_process(message, chat_history):
        # Seed a new conversation with the system prompt
        if not chat_history:
            system_message = {
                "role": "system",
                "content": """You are an AI assistant focused on answering questions about Suvaditya's resume.
                Only provide information that is explicitly mentioned in the resume data.
                If you're unsure about any information, refuse to answer and direct users to suvadityamuk.com.
                Be accurate and concise in your responses."""
            }
            chat_history = [system_message]

        # wandb.init(
        #     project="resume-rag",
        #     name="zerogpu-run",
        #     save_code=True,
        #     config={
        #         "model_name": model_name,
        #         "resume_url": RESUME_URL
        #     }
        # )

        # Append the current user message to the chat history
        current_message = {
            "role": "user",
            "content": message
        }
        chat_history.append(current_message)

        # start_time = time.time()

        # Generate the LLM's first response, which may contain a tool call
        generated_text = generate_answer(chat_history)
        # generated_text = onnx_inference(chat_history, rag_query, tokenizer)

        # Detect whether the LLM requested a tool call. If yes, execute the
        # tool; otherwise parse_tool_request returns (None, None).
        query_results, tool_query = parse_tool_request(generated_text)

        # If a tool call was requested
        if query_results is not None and tool_query is not None:
            # Update the chat history with the result of the tool call
            chat_history = update_chat_history(
                chat_history, tool_query, query_results
            )

            # Generate the final answer, grounded in the retrieved context
            generated_text = generate_answer(chat_history)
            # generated_text = onnx_inference(chat_history, rag_query, tokenizer)

        # metrics = {
        #     "conversation": {
        #         "turn": len(chat_history) // 2,
        #         "history": chat_history,
        #         "current_question": message,
        #         "current_answer": generated_text[:-10],
        #         "tool_query": tool_query,
        #         "rag_results": query_results
        #     },
        #     "performance": {
        #         "response_time": time.time() - start_time,
        #         "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
        #         "cpu_memory": psutil.Process().memory_info().rss,
        #         # "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0
        #     }
        # }
        # wandb.log(metrics)

        # Strip the trailing <|im_end|> token (10 characters) from the decoded text
        return generated_text[:-10]
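
    # Illustrative local invocation (outside Gradio), assuming model, tokenizer,
    # and client are initialized by the time it runs:
    #   print(rag_process("Where does Suvaditya study?", []))
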
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # quantization_config=QuantoConfig(
        #     weights="int8",
        # )
        # quantization_config=BitsAndBytesConfig(
        #     load_in_8bit=True,
        #     # bnb_4bit_compute_dtype=torch.float16,
        #     # bnb_4bit_quant_type="nf4"
        # )
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    demo = gr.ChatInterface(
        fn=rag_process,
        type="messages",
        title="Suvaditya's Personal RAG, a space on ZeroGPU!",
        examples=["Where did Suvaditya complete his Bachelor's Degree?", "Where is Suvaditya currently working?"],
        description="Ask any question about Suvaditya's resume and get an answer! \n\nNote: As with any LLM, it may sometimes give wrong answers. Here's a link to my [resume](https://suvadityamuk.com/uploads/resume.pdf), if you'd like to go through it yourself! Get in touch with me through [X](https://x.com/halcyonrayes), [Gmail](mailto:[email protected]), [LinkedIn](https://www.linkedin.com/in/suvadityamukherjee), or [schedule a meeting with me here](https://cal.com/suvadityamuk)",
        theme="John6666/YntecDark",
    )

    demo.launch()

    # wandb.finish()