# Note: several imports (time, requests, wandb, tqdm, psutil, pymupdf,
# QuantoConfig, and the utils helpers) back the commented-out PDF-download,
# ONNX, wandb-logging, and quantization paths further below
import os
import re
import json
import time
import requests
import wandb
import torch
import spaces
from tqdm.auto import tqdm
import psutil
import pymupdf
import gradio as gr
from qdrant_client import QdrantClient
from utils import download_pdf_from_gdrive, merge_strings_with_prefix
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

def rag_query(query: str):
    """
    Searches the vector database, which contains information
    about a man named Suvaditya, for a given query using
    semantic search. Returns results drawn from his resume,
    which contains a plethora of information about him.

    Args:
        query: The query against which the search will be run,
               in the form of a single string phrase of no more
               than 10 words.

    Returns:
        search_results: A list of results that come closest
                        to the given query semantically,
                        determined by Cosine Similarity.
    """
    return client.query(
        collection_name="resume",
        query_text=query
    )
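
# Illustrative sketch (hypothetical query): with qdrant-client's fastembed
# integration, each hit returned by rag_query() carries the matched text
# under metadata["document"], which is exactly what parse_tool_request()
# below relies on:
#
#   hits = rag_query("machine learning internships")
#   texts = [hit.metadata["document"] for hit in hits]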

def generate_answer(chat_history):
    # Render the chat history into a prompt, exposing rag_query as a
    # callable tool through the model's chat template
    tool_prompt = tokenizer.apply_chat_template(
        chat_history,
        tools=[rag_query],
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    tool_prompt = tool_prompt.to(model.device)
    out = model.generate(
        **tool_prompt,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.95,
        num_beams=4
    )
    # Keep only the newly generated tokens, dropping the prompt prefix
    generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(generated_text)
    torch.cuda.empty_cache()
    return generated_text

def parse_tool_request(tool_call, top_k=5):
    # Look for a <tool_call>...</tool_call> span in the model output
    pattern = r"<tool_call>(.*?)</tool_call>"
    match_result = re.search(pattern, tool_call, re.DOTALL)
    if match_result:
        result = match_result.group(1).strip()
    else:
        return None, None

    # Parse the JSON payload, run the requested query against Qdrant,
    # and pull out the raw document texts
    query = json.loads(result)["arguments"]["query"]
    query_results = [
        query_piece.metadata["document"] for query_piece in rag_query(query)
    ]

    return query_results[:top_k], query
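
# Example of the tool-call span the regex in parse_tool_request() matches;
# Qwen2.5's chat template wraps tool invocations in <tool_call> tags around
# a JSON object (the query text here is made up for illustration):
#
#   <tool_call>
#   {"name": "rag_query", "arguments": {"query": "Suvaditya education"}}
#   </tool_call>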

def update_chat_history(chat_history, tool_query, query_results):
    assistant_tool_message = {
        "role": "assistant",
        "metadata": "🛠️ Using Qdrant Engine to search for the query 🛠️",
        "tool_calls": [{
            "type": "function",
            "function": {
                "name": "rag_query",
                "arguments": {"query": f"{tool_query}"}
            }
        }]
    }
    result_tool_message = {
        "role": "tool",
        "name": "rag_query",
        "content": "\n".join(query_results)
    }

    chat_history.append(assistant_tool_message)
    chat_history.append(result_tool_message)

    return chat_history
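
# After update_chat_history() runs, the tail of chat_history looks roughly
# like the following, which is the transcript shape generate_answer()
# re-renders on the second pass (schematic, not literal output):
#
#   [..., {"role": "user", "content": "..."},
#         {"role": "assistant", "tool_calls": [...]},
#         {"role": "tool", "name": "rag_query", "content": "..."}]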

if __name__ == "__main__":
    RESUME_DATA = """

    Suvaditya Mukherjee Email: suvadity@usc.edu
    Portfolio: suvadityamuk.com Mobile: (213) 827-9733
    Github: github.com/suvadityamuk

    Education
    University of Southern California Master of Science - Computer Science (Artificial Intelligence); GPA: 3.85/4 - Los Angeles, CA, USA
    August 2024 - July 2026
    Courses: Machine Learning, Deep Learning, Advanced Computer Vision, Analysis of Algorithms


    NMIMS Mukesh Patel School of Technology, Management and Engineering
    Bachelor of Technology - Computer Science (Artificial Intelligence); GPA: 3.94/4 - Mumbai, India
    August 2020 - May 2024
    Courses: Deep Learning, Data Structures and Algorithms, Machine Learning, Natural Language Processing, Software Engineering,
    Operating Systems, Mathematics, Computer Organization and Architecture, Computer Networks, Database Management Systems


    Experience


    USC Institute of Creative Technologies Los Angeles, CA, USA
    Machine Learning Student Worker - Learning Sciences Lab (Part-time) September 2024 - Present

    Course Generation using Generative AI: Leverage Generative AI with LangChain and OpenAI to develop novel
    techniques for course generation, tutoring content generation, and OpenTutor courses for learning and teaching AI in the
    AIRCOEE program in collaboration with the US Department of Defense, under Prof. (Dr.) Benjamin Nye.

    Cogeneration Testbed: Maintain technologies for co-generation of tutoring content using open and cloud-based LLMs
    to help educators.

    
    USC School of Cinematic Arts Los Angeles, CA, USA
    Machine Learning Assistant - Interactive Games Division (Part-time) September 2024 - Present

    Student Worker: Assist Prof. (Dr.) Mark Bolas to develop an introductory Python Programming course for Game
    Developers.

    ML Research: Find new approaches to apply Generative AI based on LLMs and Diffusion Models to solve problems at
    large scale in Creative Media, with solutions such as generating scripts and summaries based on videos.
    
    
    HARMAN International Bengaluru, India
    Machine Learning Intern (Full-time) December 2023 - May 2024

    K-Shot Rotation-Invariant Object Detection Pipeline Development: Produced new Intellectual Property
    towards achieving a robust pipeline to perform K-shot object detection without dependence on rotation alignment.
    Improved the pipeline, achieving 35% better results on client data.
    
    Zero-shot Time-Series Forecasting with LLMs: Researched how to achieve zero-shot time-series forecasting
    with LLMs, building on previous developments.
    
    Spot Instance Handler using Agentic LLMs: Built an agent-based LLM system on Gemini 1.5 Pro and LangChain
    that reduced incurred costs by 10% by running non-critical workloads on spot instances.
    
    
    Center for Visual Information Technology, IIIT-Hyderabad Hyderabad, India
    Research Intern (Full-time) June 2023 - November 2023
    
    Research: Contributed towards research on Domain Adaptation problems in Autonomous Driving under Prof. C.V.
    Jawahar and Prof. Shankar Gangisetty.
    
    Code Implementations: Used internal tools to execute large-scale GPU training and experimentation on
    Image Segmentation problems.
    
    
    UnifyAI (Ivy) London, United Kingdom
    ML Research Engineer Intern (Full-time) January 2023 - July 2023
    
    Demos and Examples: Developed new demos, examples, and guides for internal and external official documentation,
    most notably around converting torchvision models into TFLite. Also helped establish programs and manage the
    Google Summer of Code program as an Organization Admin.
    
    Internal AI Developer: Prototyped an AI Developer (Code-LLM) that automates and builds upon existing codebases and
    speeds up internal development, while handling self-training through cloud resources such as GCP and AWS.

    
    Publications and Research
    
    Presentation: Pushing the Performance Envelope: An Optimization Study for 3D Generative Modelling with
    PyTorch: Work on finding techniques to optimize 3D Text-to-Image Mesh generation [Accepted at PyTorch Conference 2024]
    
    Paper: Guiding the Student's Learning Curve: Augmenting Knowledge Distillation with Insights from
    GradCAM: Work on investigating the effects of using GradCAM representations of Teacher models as direct inputs to
    Student models for quicker convergence. [Accepted]
    
    Paper: Project Lingua Franca: Democratizing Information through Unified Optical Character Recognition
    and Neural Machine Translation: Work on combined Optical Character Recognition and Neural Machine Translation for
    information translation with high-impact languages as targets [Accepted]

    
    Leadership
    
    Google Developer Expert: Recognized and selected as a top contributor to the Google ML Developer Community. Work
    towards creating detailed tutorials, delivering talks around Deep Learning, and helping beta-test new products in the GCP Vertex
    AI and Gemini suite of tools.
    
    Google Summer of Code: (Org Admin and Mentor) Mentored incoming students through their tasks and handled
    communications with the Google Open Source Programs Office for compliance.
    """
    # RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
    # RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"

    # ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
    # SAVE_PATH = "./model.onnx_data"

    # print("Downloading ONNX model...")
    # response = requests.get(ONNX_MODEL_PATH, stream=True)
    # response.raise_for_status()
    
    # total_size = int(response.headers.get('content-length', 0))
    
    # with open(SAVE_PATH, 'wb') as file, tqdm(
    #     desc=os.path.basename(SAVE_PATH),
    #     total=total_size,
    #     unit='iB',
    #     unit_scale=True
    # ) as pbar:
    #     for data in response.iter_content(chunk_size=8192):
    #         size = file.write(data)
    #         pbar.update(size)
    # print("Downloaded ONNX model!")

    # Download file
    # download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)

    # doc = pymupdf.open(RESUME_PATH)

    # fulltext = doc[0].get_text().split("\n")

    # fulltext = merge_strings_with_prefix(fulltext)

    fulltext = RESUME_DATA.split("\n\n")

    print(fulltext)

    # Embed the sentences
    # client = QdrantClient(":memory:", optimize_for_ram_usage=True)
    client = QdrantClient(":memory:")

    client.set_model("sentence-transformers/all-MiniLM-L6-v2")

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if not client.collection_exists(collection_name="resume"):
        client.create_collection(
            collection_name="resume",
            vectors_config=client.get_fastembed_vector_params(),
        )

    _ = client.add(
        collection_name="resume",
        documents=fulltext,
        ids=range(len(fulltext)),
        batch_size=100,
        # parallel=0,
    )
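
    # Optional sanity check (hypothetical query, safe to remove): confirm the
    # collection answers semantic queries before wiring up the LLM, e.g.
    #   client.query(collection_name="resume", query_text="education")[0]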

    # wandb.login(
    #     key=os.getenv("WANDB_API_KEY")
    # )

    model_name = "Qwen/Qwen2.5-3B-Instruct"

    # wandb.init(
    #     project="resume-rag", 
    #     name="zerogpu-run",
    #     save_code=True,
    #     config={
    #         "model_name": model_name,
    #         "resume_url": RESUME_URL
    #     }
    # )

    @spaces.GPU
    def rag_process(message, chat_history):
        if not chat_history:
            system_message = {
                "role": "system",
                "content": """You are an AI assistant focused on answering questions about Suvaditya's resume. 
                Only provide information that is explicitly mentioned in the resume data. 
                If you're unsure about any information, refuse to answer and direct users to suvadityamuk.com. 
                Be accurate and concise in your responses. """
            }
            chat_history = [system_message]
        # wandb.init(
        #     project="resume-rag", 
        #     name="zerogpu-run",
        #     save_code=True,
        #     config={
        #         "model_name": model_name,
        #         "resume_url": RESUME_URL
        #     }
        # )
        # Append current user message to chat history
        current_message = {
            "role": "user",
            "content": message
        }
        chat_history.append(current_message)

        # start_time = time.time()
        # Generate LLM answer
        generated_text = generate_answer(chat_history)
        # generated_text = onnx_inference(chat_history, rag_query, tokenizer)

        # Detect whether the LLM requested a tool call. If yes, execute the
        # tool and use its results; otherwise both values come back as None
        query_results, tool_query = parse_tool_request(generated_text)

        # If tool call was requested
        if query_results is not None and tool_query is not None:
            # Update chat history with result of tool call
            chat_history = update_chat_history(
                chat_history, tool_query, query_results
            )
            # Generate the final answer from the tool-augmented history
            generated_text = generate_answer(chat_history)
            # generated_text = onnx_inference(chat_history, rag_query, tokenizer)

        # metrics = {
        #     "conversation": {
        #         "turn": len(chat_history) // 2,
        #         "history": chat_history,
        #         "current_question": message,
        #         "current_answer": generated_text[:-10],
        #         "tool_query": tool_query,
        #         "rag_results": query_results
        #     },
        #     "performance": {
        #         "response_time": time.time() - start_time,
        #         "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
        #         "cpu_memory": psutil.Process().memory_info().rss,
        #         # "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0
        #     }
        # }
        # wandb.log(metrics)

        # Strip the trailing "<|im_end|>" end-of-turn token (10 characters)
        # left by decoding without skip_special_tokens
        return generated_text[:-10]

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # quantization_config=QuantoConfig(
        #     weights="int8",
        # )
        # quantization_config = BitsAndBytesConfig(
        #     load_in_8bit=True,
        #     # bnb_4bit_compute_dtype=torch.float16,
        #     # bnb_4bit_quant_type="nf4"
        # )
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    demo = gr.ChatInterface(
        fn=rag_process,
        type="messages",
        title="Suvaditya's Personal RAG, a space on ZeroGPU!",
        examples=["Where did Suvaditya complete his Bachelor's Degree?", "Where is Suvaditya currently working?"],
        description="Ask any question about Suvaditya's resume and get an answer! \n\nNote: Sometimes, as always, the LLM may give wrong answers. Here's a link to my [resume](https://suvadityamuk.com/uploads/resume.pdf), if you'd like to go through it yourself! Get in touch with me through [X](https://x.com/halcyonrayes), [Gmail](mailto:suvadityamuk@gmail.com), [LinkedIn](https://www.linkedin.com/in/suvadityamukherjee), or [schedule a meeting with me here](https://cal.com/suvadityamuk)",
        theme="John6666/YntecDark",
    )
    demo.launch()

    # wandb.finish()