Commit b9edf3f: First version
- .DS_Store +0 -0
- .gitignore +4 -0
- app.py +95 -0
- find_vtt.py +37 -0
- merged_vtt_data_with_questions_from_file.csv +75 -0
- mvp.py +109 -0
- requirements.txt +176 -0
- send_openai.py +50 -0
- vtt_process.py +112 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
ADDED
@@ -0,0 +1,4 @@
__pycache__/*
.my_db/*
vtt_files/*
venv/*
app.py
ADDED
@@ -0,0 +1,95 @@
import gradio as gr
import requests
from mvp import tbl
import sqlite3

# # Function to call your system API and get the response
# def search_question(question):
#     # Replace 'your_system_api_url' with the actual API endpoint
#     api_url = 'your_system_api_url'
#     try:
#         response = requests.post(api_url, json={"question": question})
#         response.raise_for_status()  # Raise an HTTPError for bad responses
#         return response.json().get('answer', 'No answer found.')
#     except Exception as e:
#         return f"Error: {str(e)}"

# Function to handle feedback submission
def handle_feedback(feedback, additional_feedback):
    # Process the feedback as needed
    return "Thank you for your feedback!"

from mvp import search_question


# Define the Gradio interface
def gradio_app():
    with gr.Blocks() as demo:

        # Header
        gr.Markdown("# RizzCon Answering Machine")
        # Textbox for question input
        question_input = gr.Textbox(label="Enter your question:", lines=2, placeholder="Type your question here...")

        # Button to submit question
        search_button = gr.Button("Search")

        # Output box for showing the result
        # result_output = gr.Dataframe(label="Answers", wrap=True, headers=["Text", "Filename", "Username"], datatype=["str", "str", "str"], visible=False)
        result_output = gr.Markdown(label="Answers", visible=False)

        # Feedback question
        feedback_question = gr.Markdown("Did the result answer your question?", visible=False)

        # Thumbs up and down buttons
        thumbs_up = gr.Button("👍", visible=False)
        thumbs_down = gr.Button("👎", visible=False)

        # Optional text box for additional feedback on thumbs down
        additional_feedback = gr.Textbox(label="Please provide specific feedback:", lines=2, placeholder="Type your feedback here...", visible=False)

        # Submit button for feedback
        feedback_button = gr.Button("Submit Feedback", visible=False)

        # Function to show the feedback options and answer after search
        def show_feedback_and_answer():
            return (
                gr.update(visible=True),  # result_output
                gr.update(visible=True),  # feedback_question
                gr.update(visible=True),  # thumbs_up
                gr.update(visible=True),  # thumbs_down
                gr.update(visible=True)   # feedback_button
            )

        # Function to show the additional feedback textbox
        def show_additional_feedback_box():
            return gr.update(visible=True)

        # Format retrieved documents as table rows (not currently wired into the UI)
        def format_results(docs):
            return [[doc.text, doc.filename, doc.username] for doc in docs]

        search_button.click(
            fn=search_question,
            inputs=question_input,
            outputs=result_output
        ).then(
            fn=show_feedback_and_answer,
            inputs=None,
            outputs=[result_output, feedback_question, thumbs_up, thumbs_down, feedback_button]
        )

        thumbs_down.click(fn=show_additional_feedback_box, outputs=additional_feedback)

        # Connect the feedback button to the handle_feedback function
        feedback_button.click(fn=handle_feedback, inputs=[thumbs_down, additional_feedback], outputs=None)

    return demo

# Launch the Gradio app
if __name__ == "__main__":
    gradio_app().launch()
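app.py imports tbl and search_question from mvp.py, which is added in this commit but whose diff is not shown in this excerpt. To exercise the Gradio UI on its own, a hypothetical stand-in like the sketch below can be used; it only mimics the interface app.py expects (search_question returning a Markdown string for result_output) and is not the real mvp.py implementation.

# Hypothetical stand-in for mvp.py, for local UI testing only;
# the real mvp.py in this commit defines tbl and search_question.
tbl = None  # placeholder for the table/index object app.py imports

def search_question(question):
    # Return a Markdown string, which is what result_output (gr.Markdown) expects
    return f"**Question:** {question}\n\n**Answer:** (retrieval results would appear here)"

With a stand-in like this in place of the real mvp.py, `python app.py` brings up the interface end to end, and the feedback widgets appear after a search as wired above.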
find_vtt.py
ADDED
@@ -0,0 +1,37 @@
import os
import sqlite3

# Function to collect all VTT filenames and their folder names
def collect_vtt_files(root_dir):
    vtt_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for file in filenames:
            if file.endswith('.vtt'):
                folder_name = os.path.basename(dirpath)
                vtt_files.append((folder_name, file))
    return vtt_files

# Function to save the collected data into an SQLite database
def save_to_sqlite(db_path, vtt_files):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS vtt_files (
            id INTEGER PRIMARY KEY,
            folder_name TEXT,
            file_name TEXT
        )
    ''')

    cursor.executemany('INSERT INTO vtt_files (folder_name, file_name) VALUES (?, ?)', vtt_files)
    conn.commit()
    conn.close()

# Example usage
root_dir = '/Users/t0mkaka/Library/CloudStorage/OneDrive-Personal/Siolabs/AI Woodstock'  # Replace with the path to your root folder
db_path = 'rag.db'  # Replace with the path to your SQLite database

vtt_files = collect_vtt_files(root_dir)
save_to_sqlite(db_path, vtt_files)

print(f"Collected {len(vtt_files)} VTT files and saved to database: {db_path}")
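find_vtt.py writes one (folder_name, file_name) row per discovered .vtt file into rag.db. A quick, illustrative way to sanity-check the result, assuming rag.db has been created by the script above, is to query the vtt_files table directly:

import sqlite3

# Inspect the table populated by find_vtt.py (assumes rag.db exists in the working directory)
conn = sqlite3.connect('rag.db')
cursor = conn.cursor()

cursor.execute('SELECT COUNT(*) FROM vtt_files')
print('VTT files indexed:', cursor.fetchone()[0])

# Peek at a few folder/file pairs
cursor.execute('SELECT folder_name, file_name FROM vtt_files LIMIT 5')
for folder_name, file_name in cursor.fetchall():
    print(f'{folder_name}/{file_name}')

conn.close()

Note that save_to_sqlite only appends, so re-running find_vtt.py against the same rag.db will insert duplicate rows.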
merged_vtt_data_with_questions_from_file.csv
ADDED
@@ -0,0 +1,75 @@
id,user,text,start_ts,end_ts,is_question
0,Hamel Husain,"Benclava is you know, one of the cracked researchers who work at answer. AI you've heard from several researchers from answer I already in this conference Ben has a background in information, retrieval. amongst other things. and has he has an open source package called rag ragatui. which you should check out. It also comes from a deep background and information retrieval and brings that to rag any also one of the clearest thinkers on the topic. but yeah, I'll hand it over to you, Ben. kind of give more color to your background. Anything that I missed. And yeah, we can just jump into it.",00:00:01.422,00:00:46.742,False
1,Ben,"Okay, let's go. So I think that's pretty much the key aspect of my background. You pretty much read this slide out. So I do. Yeah, I do. And the atan. So I with, like Jamie, you've seen Joe? No, in this course, and there's like a lot of other awesome people we distributed and the lab. So we do a research, and we try to be as open source as possible because we want people to use what we build prior to joining. And so I did a lot of Nlp. And kind of stumbled upon information retrieval, because it's very, very useful and everybody wants information retrieval. It's more for like clarifying what information retrieval is which I hope today will help. And yeah, my, so my claim to fame or claim to moderate fame at last is the Ragato Library, which makes it like, makes it much easier to use a family of models called Colbert. which we will very briefly mention today, but won't have time to go into detail. But hopefully, like, if you want to know more about that like, do feel free to ping me on this call. I'm generally either very responsive, or you need to ping me again pretty much. I walk, and I also maintain the rear and Cos library, which we will discuss in one of the later slides. And yeah, if you know me, I want to follow me. I want to hear more. But why I do. It's pretty much all on Twitter. I'm not on Linkedin at all. I'm just everything goes through Twitter a lot of memes, and should post. But some very informative stuff once in a while. So so yeah. and let's get started with what we're going to talk about today. And so it's only half an hour, so we're not going to talk about a lot I'm going to talk about. Why, I think out the like call retrieval basics us. They should exist in your pipelines, because rag is a very nebulous term, and that will be the 1st slide, and Hamel will be very happy about that slide, I think. but rags, not a silver bullet rags, not a new thing. From December 2022 rags naive on an N. 20 system will cover that. But I think it's very important to like ground it a bit. When we talk about rag, because it means a lot of different things to different people. Then we will cover basically what we call the compact Mvp. Which is what most people do when they are starting out with rag. It's actually an example from Jeremy. And it's like the simplest possible implementation of rag, as in just using vector, search. And then the other topics are basically thing that I think you should have in your rack, Pipeline as part of your Mvp. And I'll show that like, there's a lot of scary concepts because they're all big walls like Bianca cross and tf, idf, slash. BM25. Filtering. That sounds like a lot. But then I'm going to try and show it. But they are very simple concepts, and you can have pretty much the same Mvp by adding just 10 lines of code, but using, like basically state of the art retrieval components in every bit. and the bonus, which I don't think we'll have time to cover when I try this again was talking about Colbert, because I like talking about Colbert, so I might do it at the end if we have some time. But I might not. And yeah, that's it for the agenda. And then I also think it's important to have the contour agenda, which is what we won't be talking about today, because those are just as important for Rag. But they're not what would put in the very basics, and here will very much about the basics. 
So one of them is how to monitor and improve rag systems because rags are systems and they're living systems and very much things you should monitor and continuously improve on. I think Jedon covered that quite well in his talk yesterday or last week. Yeah, last week. So I would invite you to watch that and watch Jason and Dan's upcoming course, if it does materialize evaluations. They're also extremely important. But we won't talk about them at all today. But I know that, Joe. We talk about them at length in this talk. benchmarks and paper references. So I'll make a lot of claims that you will just have to trust me on, because I don't want to like have too many references, or too many, like academic looking tables and daystream to keep it quite lively, and a I won't give you a rundown of all the best performing models, and why you should use them. I won't talk about training the augmentation, etc, and I won't talk about all the other cool approaches like split call bear in details because they go beyond the basics. But those are all very important topics. So if you're interested to look up, there's a lot of good resources out there do feel free to ask me. And with that let's get started with the rant. which is my favorite part. So this is a thing that Tamil been doing on Twitter recently as part of his flame posting campaign, I'll say, which is basically. there's so much in AI so much, especially in the Adl. M. World that uses walls that are like a lot scarier than they need to be, and rags probably that because to me when I hear retrieval of message, generation or rag. It sounds like that's an end to end system. That's a very definite set of components. That's a thing that works on its own. And it's not. It's literally just doing retrieval to put stuff into your prompt context like before your prompt, after your prompt, you want to get some context where you're doing retrieval. But that means that's not an end to end system. Despite what Jason will have you believe on his twitter is not created it, but it does make a lot of money from it. And it's basically just the act of stitching together retrieval. So the our part of rug and generation. So the G part of like to ground the letter through like you want to your generation to be granted to you some context. So you're doing rich revol, and whatever documents you have and pass it to your Lm. But there's no magic going on. It's very much like a pipeline that take the output of model A and gives it to model. B. The generation part is what's handled by the large language. Models and good rags are actually 3 different components. It's your good retrieval pipeline. It's a good generative model, and it's a good way of linking them up so it can be formatting your prompt or whatever. and it's very important to think about it when you're saying my rag doesn't work. You need to be more specific, like my rag doesn't. Fork is the same as saying like, car doesn't fork. It's like, Yeah, but something specific is broken. You need to figure out what is it. The retrieval parts is the Lm. Struggling to make use of the context, etc. There's a lot of fail cases there. And with that being said, let's look at what like the compact Mvp. Is. So that is basically what you will see. I think. if you've read any medium block post about the advent of rag in early 2023. That's the pipeline that everyone used, and that's also because the easiest pipeline to bring to production is very simple. You have a query. You have an embedding model. 
You have documents, the documents get embedded and pulled into a single. Vector. then you do cosine similarity. Search between the vectors for your query and for the documents, and that gets you your results that gets yours call. And this is a bit of a teaser for an upcoming slide. When I say this is called the Bianca approach purchase. So you get the term in mind, and I'll define it, because that's 1 of those things that is like a scary term. That's actually, very, very simple when you break it down. But 1st let's look at what this actually means in code, this whole pipeline. So the 1st thing you want to do is load your model. Then you get your data. you encode it, you store your vectors. Then you get your query. You encode it, and then here we use numpy. You do a cosine similarity search egot product between normalized vectors to get the most similar documents, and the document that I, similar to like the documents whose embedding are similar to your query. Embedding is what you would consider as you relevant documents, you know. and that's pretty much it. That's modified from something that Jeremy did to showcase how simple Rag actually is in his house guide to Lms. But that's what you want to do to retrieve context in the simplest possible way. And you will have noticed that there's no vector dB in this. This is all numpy eyes. And this is all numpy eyes. Because when you use vector dB's. the huge point of using a vector dB is to allow you to efficiently search through a lot of documents, because what a vector dB does generally not all of them. But most of them wrap stuff like at H. And SW. Or Ivf. PQ. Which are indexing types. And what that allows you to do is to find and retrieve relevant documents without having to compute cosine similarity against every single document. It like tries to do an approximate search of an exact search. This is not something that you need. If you're embedding like 500 documents like your CPU, you can do that in milliseconds. You don't actually need a vector dB, if you're trying to go to the simplest possible stage, but if you wanted one, it would go right here on the graph like, right after you unpad your documents. You would put them in the victor diving. And the second thing I think to discuss about is like this tiny graph is. why am I calling embeddings by encoders? Because that step that I call bank order. You will have seen a lot of times, but you will always see genetic call embeddings, or model and Bangkok the term that the I Al attach your uses to refer to that. and it's simply because you encode things separated like you do 2 encoding stages. So it's a buy encoding. And that's used to create single vector presentations where you pre-compute all your documentary presentations. So when you're using biancoders. you uncovered your documents whenever you want like when you're creating your database, when you're adding documents, those get encoded at a time that's completely separate from in France. and then only at in France. Will you like, in the second aspect of this colon? Will you embed your query to compare to your precomputed documentary presentations? So that's really, really computationally efficient. Because at in France you're only ever encoding one thing, which is the query, and everything else has been done before. And so that is part of why it's done that quickly. And I did want to take a slide break, because I can see there are questions, but they're not showing up on my screen. So they're only on like this. Quick! Mvp. 
Done.",00:00:48.373,00:10:09.153,False
2,Dan Becker,"Yeah, let me look through some of the questions. I'm gonna give you a few of them, and you can decide whether you want to take them now or later. So we got one. It's a 7,000 query, a 7,000 question and answer data set. Can I optimize rag to accurately retrieve and quote exact answers. Also, if I think any queries is slightly different from the original data. I think there's actually 2 parts that. So one is to quote the exact answer to something about the is not the information retrieval part. but it's rather just like, what do you tell the Lm. To do But the information chival part is probably well. you see. Oh. you go ahead.",00:10:11.883,00:10:58.122,False
3,Ben,I will actually cover how to better deal with out of context things in like an upcoming slide. It's.,00:10:58.123,00:11:05.073,False
4,Dan Becker,Add it? Yeah. Why do you keep going? None of these questions.,00:11:05.073,00:11:08.693,False
5,Ben,"Yeah, I'll just go back and say. Yeah, perfect. Yeah. Okay, so the next one is, if that's very computationally efficient. there is an obvious trade off here, and that is, your documents are entirely unaware of your query, and your queries are entirely unaware of your documents. which means that you're very, very like subject to how it was trained is basically, if your queries look a bit different from your training data, or if like, if there's very, very specific information that will be in certain documents and not although sometimes you want to know what how the query is phrase. You want to know what the query is looking for when you encoding your document so that it can like kind of paint. That representation represented more towards information that you're interested in. and that's done with what we call rerunking. So rerunking is another one of the scary stages that we'll see in your pipeline, and the most common way to do. Rerunking is using something that we call cross encoder. and cross encoder is another one of the scary walls, like by encoder. That you feel should be like a very advanced concept. But it's actually very simple. This graph here represents the whole difference between them. The Brian Coder is basically this 2 column system that we describe where documents get encoded in that corner, queries get encoded in their own corner, and they only meet very, very late like you only do cosine similarity between vectors. but the documents never seen the query invite, that's our. The cross encoder is different. The cross encoder is a model that will take your document and your query together, so you're going to give it. But your document, or like sales of documents depending the type of model. But to keep it simple. We do it one by one, so you always give it a quaid document pair. and you put you through this cross and coder model, which is effectively a classifier with a single label, and the probability of the label being positive is what your model considers as how similar the documents are, or how relevant it is. This is extremely powerful, because it means that the model knows everything about what you're looking for when it's encoding the document, and can give you a very accurate score or a system more accurate score. The problem is that you can see how that wouldn't scale, because it's not very computationally realistic to compute this like query documents call for every single query document that every time you want to retrieve a document. Say, you've got like Wikipedia embedded you've got, I don't know. Like 10 million paragraphs, you're not gonna compute 10 million scores through a model for like choosing 300 million pound meters for every single document, are you? You would eventually reach out, return something, and it would be a very, very relevant document. But it will also take 15 min, which is probably not what you want in production. So you probably also have had, or you might also have had, if you're ready to retrieval of not hell at all if you're not into retrieval of other rerunking approaches like rank Gpt or Rank Lm. Using lms to run documents has been a big thing lately for people really into retrieval, you will know, if notified, etc. So those are not cross encoders. But that's not really relevant to us, because the core idea the same. And that's basically what we always do with rerunking in the pipeline. You use a powerful model that is computationally expensive to score. 
Only a subset of your documents, and that's why it's reranking and not ranking, because this can only work if you give it like. I don't know 1050, not more than that document. So you always have a 4 stage retrieval, which here is our vector search. And then the rerun card does the ranking for you. So it creates another list. There's a lot of ways to try those models out. Some of them have an Api base. So it's just an Api call to cohere some of them. You run your machine if you want to try them out. And this is basically the safe promotion per month. I do maintain that until the day I library just called reruncers with the QR code here. where it's basically a unified Api, so you can test any ranking method in your pipeline and swap them out for you. and that's what your pipeline looks like. Now, it's the same with just that one extra step at the end where you rerun things before getting your results. So we've added rerunking. But there's something else that's missing here, and that's something actually addresses the 1st question that is partially is that the semantic search via embeddings is powerful, and I'm not saying, Don't choose. Vectors. Vectors are cool, like models are cool, deep learning is cool. but it's very, very hard if you think about it, because you're asking your model to take. I don't know 512 tokens even more. If you're doing a long context. And you're like, okay, put all of this into this one. Vector we are just using a single vector you've got like I don't know. 384, 1024 most floats, and that must represent all the information in this document. But naturally losing. There's no way you're going to keep all of the information here. and what you do when you're training on embedding is that you're teaching the embedding to replant information that is useful in that training. So the model doesn't learn to represent all of the documents information, because that's pretty much impossible, since our meetings are essentially a form of compression. What the model actually learn is to replant the information that is useful to the training queries. So your training data is very, very important here. It's like replanting the documents in the way that will help you use the queries in the weather phrase in your training data to retrieve a given document. So when you use that on your own data, it's likely that you're going to be missing some information, or when you go slightly out of distribution. There's another thing which is, humans love to use keywords, especially if you're like going into the legal domain, the biomedical domain, anything specific. We have a lot of acronyms that might not even be in the training data. But we use a lot of acronyms. We use a lot of like very advanced medical walls like people love jargon. People love to use technical walls, because those are very, very useful. and that's why you should. And I know it sounds like I'm talking from the 70 s. Because that's actually a method from the 70 s. But you should always have keyword search in your pipeline. You should always also have full text search on top of, like anything that you do with vector. and keyword search, which you can call full text search, or like. Tf, Id. Fbm. 25. It's powered by what we call tf, idf. 
which is a very basic Nlp concept that essentially stands for term frequency inverse document frequency, and it assigns every single world in a document, or like group of words, because sometimes we do them 2 by 2, a 3 by 3 even it gives them a weight based on how rare they are, so like a wall that appears everywhere like a V or A as a very, very small weight, and a wall that's like highly specific to certain documents at a very high weight. And the main method to use tf, idf for retrieval is called bm, 25, which stands for best matching 25. It was invented in the seventies. It's been updated since then, but basically just been iterations of it. and you'll often hear Ir research. I'll say that the reason that the fields not taken off like Nlp has or computer vision has is because the baseline is just too good, like, we're still competing with the M. 25. And though it's been 50 years now. my God, it's been 50 years. Yeah. So the M. 25 existed. For, like, basically my entire lifetime before my birth. And it's still using production pipeline today. That's how good it is. And the good thing is, it's just world content with a match, with like awaiting formula. So the compute time is virtually unnoticeable like you can add that to your pipeline, you will absolutely never fit it. and I know I said I wouldn't add anything from papers, but I feel like, because I'm making a very strong claim that this method from 70 is strong. I should add a table, and at the table, from the bare paper. which is the retrieval part of Mtb. Which is likely the main embeddings benchmark. And they compared it to a lot of models that were very popular for retrieval, like Dpr. And very strong vector retrievers. And basically you can see that unless you go into very over train embeddings like E. 5 B. Ge. The M. 25 is competitive with virtually all deep learning based approaches, at least at the time of the paper, which was only just 3 years ago. We now have embeddings that are better, but we don't have any embeddings that are better to the point where they're not met better by being using conjunction with Bm. 25. Though. knowing that this is how you want your pipeline to look, you'll notice that there's now a whole new pathway for both the query and the documents. Go on top of being uncoded by the embedded, also encoded by Tf. Idf. to get full text stash, and that will help you like retrieve keyword, etc. Humans use keywords in quiz all the time. It's something you should do at the end. You will combine the scores. It's you can do that in a lot of ways. I won't go into too much details. But what a lot of people do is give a weight of 0 point 7 to the cosine simulated score and 0 point 3 to the full text. But I'm pretty sure we could do a whole talk, for now, on different methods of comparing that. Okay, I do have 5 more minutes. So the last one that you want to add to a simple pipeline the thing that I think really completes your Mvp. Plus is using metadata and using metadata filtering because academic benchmarks don't, because Academy in academic benchmarks. Documents exist mostly in a vacuum like they don't exist in the real world. They're not tied to a specific company, etc. When you're using Ragin production. it's very, very rare that someone comes to you and says, these documents came to me in a dream and caught them like they came from somewhere. They've been generated by a department. They've been generated for a reason. They might be all the excel sheets or whatever, but they are like. 
They have business sense, or they have like in contact sense. And the metadata is actually sometimes a lot more informative than the document content, especially in rag contexts. So if you take the query here, which is. can you get me the cruise division financial report for Q. 4, 22. There's a lot of ways in which this can go wrong if you're if you're just looking at it from like the semantic, or even using keywords aspect. So when you say, when you see like this, the model must capture the financial report. So you, the model, must figure out you want a financial report, but also cruise division Q. 4 and 2022, and embedding models are bad at numbers. so you might get a financial report that may be for another division, or maybe for the Cruise Division of 1998. It's very hard to just hope that your vector will capture all of this. But there's another Federal case which will happen, especially with weaker Lms. If that if you just have, like top top chaos in top 5 documents, and you retrieve the top 5 documents for your query. Even if your model is very good, if you just let it retrieve the top 5 documents, no matter what. You will end up with financial reports, at least 5 of them, and does most likely only one function for 22. So at that point. You're just passing all 5 to the mother, and being, like good luck, use the right one, which might confuse it, especially because tables can be held, etc. And so I'm not saying that your vector such will fail. But statistically, it will like. In most cases it will fail. And if you don't fail for this query, it will fail for a similar one. But that's actually, very, very easy to mitigate. You just have to like, think outside of the vector and just use more traditional methods. And you can use entity detection models. And one that's very good for this is gleaner, which is a very recent model that does basically 0 shot entity detection. So you give it arbitrary entity types. So like document, type, temp period. And the department. And this is like a live thing of clean, or you can run the demo on the bottom. But here we just extract financial report, empire and department. and when you generate your database for rag. All you need to do is basically specify the time period. So when you get an excel sheet, you will just pass the name for it, or pass the date in it, and give metadata 2024. Q. 2. Like Q. 4. So it's 2022. Q. 2. The Q. 4. Okay, mixed up there. And then you just need to ensure that this is stored alongside your document, and at quite time you can always pre filter your document set to only query things that make sense. So you will only query documents for this relevant time period. So you ensure that even if you give your model the wrong thing, it will at least be as the right timeframe, so it can maybe try and make sense of it. And with this final component. This is what your pipeline looks like. You can see the new component here, which is meta filtering which doesn't apply to queries. Queries go right through it. But documents get filtered by that, and we won't perform search on documents that will not meet the metadata that we want. And okay, I do agree that this looks a lot scalier than the friendly one at the start, which just that you want better and then cosine similarity search and the results. But it is actually not very scary. This is your full pipeline, this implements, everything we've just talked about. It's about 25 lines of code. If you remove the commands. 
It does look a bit more unfriendly, because there's a lot more like moving parts. I think there's a lot more steps. But if you want, we can just break it down a bit further. And so we use lens. dB, for this. And this is not necessarily an endorsement of lance Dbs of vector, dB, although I do like lens. dB, because it makes all of these components, which are very important, very, very easy to use. But I don't. I tried not to take side in like the vector dB, was because I've used. We have yet. I've used chroma. I've used lands. dB, have you spend Con. They all have that place. But I think, lance dB, if you're trying to build an Mvp is the one should always use phone Vps right now because it had those components built in. And here you can see just how easy it actually is. So we still load the bank holder just in a slightly different way, same as earlier with the final document, metadata. Yeah, it's just a string category. If it could be a timestamp it could be just about anything. Then we uncoord documents just like we did previously. And that's here we've created. So it's not an index. This is still a hard search. This is not an approximate search. then we create a full text search index, which is generating those tf, idf, so why mentioned before we give away to every single time in the documents. Then we load the rerun car. Here. We're using the coherer because it's simple to use an Api. and at the very end you've just got your way and your search where we restrict it to the Category Equals film. So we will only ever search into the document. That's about a film. No, but an author, no, but a director. We get the top 10 results, and we just have a quick ranking step. and that's pretty much it. We've taken the pipeline at the start, which only add the bangkok, or component to a pipeline that now has the bank color component metadata filtering full text search and arianco at the end. So we've had that, like, basically the 4 most important components of retry into a single pipeline. And it really don't take much more space in your code. And yeah, that is pretty much the end of this talk. So there's a lot more to cover in rag. This is definitely not the full call off like. But this is the most important thing like this is what you need to know about how to make a good pipeline very quickly. All the other improvements are very, very valuable, but they have a decreasing cost, effort, ratio. This takes virtually no effort to put in place definitely worth learning about spouse methods. Multi victor methods, because they are very adapted to a lot of situations. Call bear, for instance, is very strong out of domain. Spar is very strong in domain. You should watch Jason's talk about rack systems and Joe's upcoming talk about retrieval evaluations, because those are like your traffic of the most important things. And yeah, any questions. Now.",00:11:08.693,00:26:09.062,False
6,Dan Becker,"Hamil and I were just messaging, saying we love this talk at like. It's everything is presented so clearly. So. we've also got quite a few questions. I'm happy to.",00:26:12.703,00:26:29.992,False
7,Hamel Husain,"My favorite talk so far. So you know, that's big favorites. But yeah.",00:26:29.993,00:26:35.012,False
8,Ben,Thank you. Tom.,00:26:36.113,00:26:37.002,False
9,Dan Becker,So go ahead.,00:26:37.493,00:26:38.223,False
10,Hamel Husain,"Okay, questions. You were gonna pick. Did you have one that you were looking at already, Dan? I can try it.",00:26:41.348,00:26:47.212,False
11,Dan Becker,"Yeah, we've got one that I quite like in the way that you fine tune your buy encoder model affects how you should approach fine tune if you're cross, encoder, and vice versa.",00:26:47.213,00:26:55.712,False
12,Ben,"Yes, I don't think I can give like a really comprehensive answer. It will really depend on your domain which would generally want them to be complementary. So if you've got, if you're in a situation where you've got like the compute and the data to function both. you always want to by encoder to be a bit more loose like you wanted to retrieve potential candidates, and then you want to trust your rerun card like your cross encoder, to actually do the filtrain. So if you're going to use both and have full control over both, you might want to intruding in a way that would basically make sure that your top K candidates can be a bit more representative and trust the Ryanka.",00:26:57.953,00:27:32.393,False
13,Dan Becker,"Yeah, let me ask. This wasn't an audience question, but related question. You showed us where the when you choose questions to feed into the reranker. That's sort of a weighted average of what you get from the tf, idf, or bm, 25, with what you get from the just simple vector search. What do you think of as the advantages disadvantage of that over saying, we're gonna take the top X from one cat, from one of the rancors and the top X from the others. And that way, if you think one of these is is for some questions especially bad. You have a way of short, short circuiting. Its influence on what gets sent to the the re-recker.",00:27:34.913,00:28:26.672,False
14,Ben,"Yeah, I think that also makes complete sense. And that's another also cop out. And so I'll use a lot. But that also depends a lot on your data like a lot of the time you want to look at what's your actual context and how it's actually being used. Because in some situations that actually works better like, especially if you walk with biomedical data because there's so much like specific documents. It's quite often the embedding won't be that amazing on some questions. So you just want to take the top 5 from both and get the Ryanka to do it quickly. The Ryanka is quite aware. So it's a perfectly valid approach to combine them that way. Yeah.",00:28:27.803,00:29:01.462,False
15,Dan Becker,You wanna pick your question? Hamil.,00:29:04.033,00:29:06.193,False
16,Hamel Husain,"yeah, I've been looking through them. You guys been okay? Jeremy is asking, can we link get a link to the code? Example?",00:29:10.663,00:29:17.899,True
17,Ben,With.,00:29:19.492,00:29:19.953,False
18,Hamel Husain,Your slides in Maven. We could also. Can I share your slides in discord as well? Ben.,00:29:19.953,00:29:25.282,False
19,Ben,"Oh, yes. Please. Yeah.",00:29:25.283,00:29:26.613,False
20,Hamel Husain,I'll go ahead and share slides and discord.,00:29:26.613,00:29:28.412,False
21,Ben,"And I'll share, like the Github gist, for the code examples of photoch.",00:29:29.373,00:29:32.932,False
22,Dan Becker,"And I'll embed the a link to the slides in maven for people who are to talk some point deep into the future, and who might lose track of it in discord. there's a question summary in here. I'll I'll find in a moment. But we've got this question for Jason the speed. And then the speaker, just before you, Paige Bailey said, rag, you know, in the world of 1 million token context length is not going to be as important. What's your take on the relative of importance of rag in the future?",00:29:34.163,00:30:19.462,True
23,Ben,"So I'm still very hopeful about Rag in the future, and I think I see it as some some sort of like. So your Lm. To me is like your CPU, and your contacts window will be your RAM. And so like, even if you've got 32 gigs of RAM the bodies ever said, Yeah, throw away your hard drive. You don't need that like in a lot of context, you'll still want to have, like some sort of storage where you can retrieve the relevant documents. Having to use a long context window is not never gonna be a silver bullet just like rags. Never a silver bullet. But I'm actually really happy because it just means I can retrieve much longer documents and get logged more efficient rack systems, because to me it's a bit of a trade off where, if you've got longer context. it just means you've got a lot more freedom with how quick your retrieval system can be, because if it if you need to use top 10 or the top 15, that's fine, you can fit them in, whereas when you kind of lift it, the top 3 documents. You need your retrieval system to be really good, which might mean really slow. So yeah.",00:30:20.552,00:31:14.872,False
24,Dan Becker,We had a question from widium. What? What are your thoughts on different chunking strategies?,00:31:26.827,00:31:33.553,True
25,Ben,"I probably don't think about chunking as much as I should. I am very hopeful for 2 avenues using Lms. To Pre chunk. I don't think those walk very well right now. That's in my test. I've never been impressed. but also I do tend to use call bear more fund and bank orders and call bears a lot more resistant to chunking. So it's something that I don't care about as much. But generally I would try to. So my go to is always to chunk, based on like around 300 tokens per chunk and try to do it in a way where you never cut off, or sometimes in the middle, and always keep like the last 50 tokens in the next 50 tokens of the previous and next chunk. because information overlap is very useful to give content like, please don't be afraid to duplicate information in your chunks.",00:31:35.989,00:32:20.783,False
26,Hamel Husain,"I have a question about the buy encoder. Do you ever try to fine tune that we're using some kind of like label data to get that to be really good? Or do you usually kind of use that off the shelf and then use a re-ranker? And you know, how do you usually go about it? Or how do you make the trade off.",00:32:22.485,00:32:42.613,False
27,Ben,"So again, context dependant. But if you have data, you should always function. All you want us be the band code or the cross and coda, I think, call bear. Because single vector, you can get away with not functioning for a bit longer, because multi vector, so you can get away with not functioning for a bit longer. But if you have data, it's all about like basically the resources you have. So in this talk, we're doing an Mvp. Is something you can put together in an afternoon. If your company says you have $500, spend 480 of that on open AI to generate synthetic question and find you on your on Kodos. That will always get you better results like always venturing if you can. And so yes, so a couple of questions about fitting colbare in. And I'm using presenter executive decision to answer those. So call bear in this pipeline. Some people use it as a rerun car. But then that's not optimal. That's very much. When you don't want to have to change your existing pipeline. If you were to design the pipeline from scratch and wanted to use. Call back, you would have it instead of the Biancoda. and it will perform basically the same role as the bank order, which is 1st edge retrieval. and if you wanted to use call bear, and especially if you don't have the budget to function and need the rerunking step. Sometimes it can actually be better to use call bear. So rerun cost. Still. because the multi vector, approach can be better, like capturing keywords, etc. But that's very context dependent. So ideally, you would have it, as shown by encoder.",00:32:43.623,00:34:20.872,False
28,Dan Becker,Or a a lot of people here probably aren't too familiar with Colbert Colbert. Can you? Give the quick summary of it.,00:34:22.793,00:34:32.053,False
29,Ben,"Yeah, sorry I got carried away, because so the question so callback is an approach which is effectively by encoder. But instead of cram cramming everything into a single document to a single vector. you represent each document as a bag of embeddings. So like. if you've got 100 tokens instead of having one big 124, vector you will have a lot of small 128 vectors. What? One for each token. And then you will call that at the end. You will do the same for the query. So if you're query 32 tokens, you will have 32 query token. and for each query token, you will compare it to every token in the document, and keep the highest call. and then you will sum up those highest scores, and that will be the score for that given document that's called maximality. And the reason that's so powerful is not because it does very well on data. It's been trained on. Actually like you can beat it with a normal bang coder. But it does very well at extrapolating to out of domain, because you just give the model so much more room to represent each token. It's contact. So it's much easier if, like, you're in a non familiar setting. You've not compressed as much information. and I do have self promotion. I do have a pretty cool call bear thing coming out later this week to compress the call bear space by reducing the tokens it actually needs to save by about 50 to 60% without losing any performance. So that's a bit of a tizer. But look forward to the block post. If you're interested.",00:34:32.483,00:35:54.437,False
30,Dan Becker,And to find the blog post you suggest. People follow you on Twitter or.,00:35:57.423,00:36:01.443,False
31,Ben,"Yeah, definitely, for on the end without. since it was pretty much the only place where you can reliably reach me.",00:36:02.276,00:36:08.422,False
32,Hamel Husain,"Someone's asking, what are some good tools to fine tune embeddings for retrieval? Would you recommend Rag Reggae, or anything else like, what's your.",00:36:14.263,00:36:24.142,False
33,Ben,"I'd recommend sentence transformers, especially with the like. 3 point all videos. Recently, it's now much, much friendlier to use. And it's basically there's no need to reinvent the wheel. They've got all the basics implemented very well there. So sentence transformers.",00:36:24.713,00:36:38.132,False
34,Dan Becker,Question from divvia. Can you give any pointers on how one fine tunes their embedding model.,00:36:44.463,00:36:51.832,False
35,Ben,Sorry. Can you repeat that I got Australian.,00:36:53.313,00:36:55.473,False
36,Dan Becker,"Yeah. The question is, can you give any pointers or describe the flow? For when you fine tune your embedding model.",00:36:55.473,00:37:02.293,False
37,Ben,"Okay? So that's probably a bit more involved than this talk. But essentially, when you mentioned your embedding model, what you'll want is queries. But you need to have quiz, and you need your documents. And you're gonna tell the model for this given query, these documents for advance. And for this given query, these documents natural events, because sometimes user triplet loss. And the triplet loss is what you will do when you have one positive document in one negative document. and you'll kind of be teaching the model. This is useful. This is not useful. and I'm not gonna go to done too much, because this rabbit hole can take you quite far. But sometimes, when you have triplets, you also want to use what we call harm negatives. which is, you want to actually use retrieval to generate your negative examples, because you want them to be quite close to what the positive example is, but not quite the right thing. because that's where you teach the model more, but was actually useful to answer your pray. So the workflow is probably, as always, look at your data, figure out what kind of quiz your user would actually be doing. If you don't have a user queries. go into production, write some, write some queries yourself, and give that to an Lm. Generate. More queries, and you can have a pretty solid pipeline like that.",00:37:04.103,00:38:14.433,False
38,Hamel Husain,"Someone's asking the discord. And I get this question all the time. Is, please share your thoughts on the graph rag.",00:38:16.613,00:38:23.652,False
39,Ben,"I have never actually done graph rag. I see this mentioned all the time, but it's not something that has come up for me at all, so I don't have strong files now. I think it's cool, but that's pretty much the full extent of my knowledge.",00:38:25.583,00:38:40.533,False
40,Hamel Husain,"Someone's asking. Okay. when? When you have long context windows. does that allow you to do something different with rag like retrieve longer documents, or do any other like different kinds of strategies than you were able to before? Or does it change anything? How you go about this? Yeah.",00:38:49.580,00:39:08.423,False
41,Ben,"Yeah, I think it's a bit why I mentioned before to me changes 2 main things. One is, I can use longer documents, which means I can use longer mothers, or I can like teach chunks together, because sometimes. sometimes, if your retrieval model isn't very good at retrieving long documents, which is often the case you might just want. If I go chunk from these documents, give the model the full document like, if I just get the chunk from it pass the full contacts, and you just hope the model is able to read it. And if you've got good long context model, it can. So it changes how you decide to feed the information into the model. And then the other aspect is like, I said, it changes the retrieval overhead, because if you need to be very good like, I was saying, if you need only the top 3 documents to be relevant, you're gonna spend a lot of time and money on your pipeline. if you're like. Oh, as long as my record at 10, or my record at 15 is good, that's fine. You're good. You can't afford to have like much light on others, and spend a lot less time and resources on retrieval. like there's a lot of diminishing returns in retrieval when getting a good record at 10. So like record at 10 is how likely you are to retrieve the relevant document in the 1st 10 results is generally very easy. Recorder 200 is very, very easy. and then record at 5. It's getting held on record at 3 and record at once. I like the really tough ones, because a lot of the training data is noisy. So it's gonna even be hard to know what a good record at one is. The longer context makes that irrelevant. And that's why it's great, for, like.",00:39:08.923,00:40:30.762,False
42,Hamel Husain,"Someone's asking, and I don't even know what this means. What's your view on hide versus react versus step back.",00:40:49.053,00:40:54.653,False
43,Ben,"I've only used react out of those. And so those are like agentic systems of function coding. So it's like to give you other them. The ability to call tools at this react is. I don't have strong thoughts on those in the context of rich reverse. So I can't really answer the question. Yeah, I think I would occasionally use react from the model to be able to trigger search itself. But I think that's still on the Panay of research. And I think Griffin from answer is also in the in the chat. And he's very interested in that is basically, how do you get a model to tell you that it doesn't know cause. Sometimes you don't need retrieval, the model that already knows. Sometimes you do need retrieval. but that's still a very open question like, How do you decide when to search no strong thoughts there? Yet.",00:40:57.801,00:41:44.892,False
44,Dan Becker,"There may or may, you may or may not have a good answer for this one. Is there an end to end? Project, open source project that someone could look at as a way to see or evaluate the difference in result quality when they do result from just buying code or Mvp. And compare that to the final compact. Mvp. Plus plus that you showed.",00:41:48.763,00:42:10.893,False
45,Ben,"no, actually, that's a very good point. I don't think there is one that systematically go through every step. and that's probably something that I would like to build at some point or find one, because. or just like most things in retrieval, everything is kind of com convention, always done like you've seen it piece. And this is in a lot of projects, and you just know that that's how it is. But unless you dig deep into the papers, or like, do it yourself. It's quite rare to find very good resources showing that.",00:42:14.073,00:42:42.293,False
46,Dan Becker,"Related question, do you have a tutorial that you typically point to people to on fine-tuning fine, tuning their encoder.",00:42:54.758,00:43:05.663,False
47,Ben,"That would be the sentence transformers documentation. But it's not the friendliest tutorial. So that's a half answer. That's why we punch you to, but still a bit like up to gain, to sadly.",00:43:07.507,00:43:20.263,False
48,Hamel Husain,Wade is asking if you have Goto embedding models.,00:43:40.263,00:43:43.303,False
49,Ben,"like, my, go to these days when I'm demoing something, you do cohere one because it's nice to be able to walk with an Api. It works really well, it's cheap. But but other than that, though I would just call bear. If I'm using something in my own pipeline, I would just like multi vectors. But it really depends on the used case, cause you would often find that, like some things, walk well for you, and some things, don't. I do have strong opinions on not using. So if you go to the Mtv. Double board, which is the embedding leaderboard right now, you'll see a lot of Lms as encoders, and I would advise again that because the latency isn't worth it don't need 7 billion parameters to uncode stuff. and at least some of the early ones actually generalize, was like this, remember, from Cohere at the really interesting table where the E 5 mist road was worth an E 5 large, despite being 7 times as big. So probably just stick to like the small ones between like 100, and at most a billion parameters. But that would be my only advice about that. Try all the good ones like Gt. Bg, E. 5.",00:43:48.283,00:44:57.723,False
50,Hamel Husain,"Chris Levy is asking this question about elastic search, which I also get quite a lot. So yes, anyone here have experience building rag application with just keyword. BM. 25. As a retriever at work. It makes use of elastic search. And you said it's all over the tech stack, like people are already using elastic search. Is there? Basically, he's asking, is there a way to keep using elastic search with rag that you know about, or that you've encountered? Or do you mainly use like vector database like lance dB, and things like that? Have you tried seeing people using elastic search and trying to bootstrap off of that.",00:45:00.329,00:45:36.892,False
51,Ben,"Yeah, I've used elastic search a bit, and it's perfectly possible you do lose. Obviously the semantic search aspect, although I think now elastic search. Has a vector dB offering so you could add vectors to it. You could always plug in. You could always just do bm, 25, and then plug in the rerun car at the end. That's often, if you read papers on like cross encoders. Ben Ali, the way they evaluate them is actually doing just that like do Bm, 25. To retrieve 50 to 100 documents and then run them, using the rerun car. which, if you can afford to just set up your rerunking pipeline or call the cohere. Api is a really good way to go about it, because you don't need to embed your whole documents to sample how good it would be with deep learning. because there are domains where you do not need the planning. The M. 25 is still good enough in some bits. and you know, like I think it's become very apparent, like bm, 25 has never told anyone they should eat 3 rocks a day. but my biddings have so.",00:45:37.483,00:46:33.573,False
52,Hamel Husain,"Dmitry is asking, Is it worthwhile to weigh the 25 a. Bm. 25 similarity score during the ring re ranking step as well.",00:46:35.573,00:46:43.112,False
53,Ben,Probably not. You generally just want to use Bm. 25 to retrieve candidates. So you don't need to give those calls to the embedded to your cross and color.,00:46:45.213,00:46:53.622,False
54,Dan Becker,"There's a question. I'm going to change it slightly. So someone asks about retrieving from many documents rather than finding the best one, and maybe the tweak there is. If you have a theory that information within any single document is so correlated that you actually want to try and get some diversity. Are you familiar with? Or have you used approaches where you? I specifically try and in some lost function somewhere. Encourage that diversity and encourage pulling from many documents rather than from one.",00:46:59.473,00:47:35.502,False
55,Ben,"I have not done that myself. I know that there's different like loss methods to optimize like for diversity versus the accuracy. But I don't think I would be able to give you a clear answer without something really confident about something. I don't know about you, but.",00:47:37.543,00:47:52.252,False
56,Dan Becker,Have you used hierarchical rag? Any thoughts on it?,00:47:59.243,00:48:01.813,True
57,Ben,"I have not, and I don't think it's very needed, for, like the current pipelines, I think there's a lot other for those steps you can improve.",00:48:03.273,00:48:10.833,False
58,Dan Becker,"Since I think we have several Answer.AI people here, and I don't know if this is a question or a request: I'm eager to learn if Answer.AI will come up with any books on LLM applications in the future.",00:48:18.243,00:48:30.323,False
59,Ben,"I don't think so, but never say, never. Jamie, if you want to chime in. Yeah, cause I can't make any promises cause my boss is watching so.",00:48:32.973,00:48:45.373,False
60,Dan Becker,"You see anything else to? Ben, did you say that you can't see the questions.",00:49:00.073,00:49:05.582,False
61,Ben,"Yeah, they're all blank for me. I saw one earlier, but they really show I've spotted a cliff. Pam.",00:49:05.863,00:49:11.113,False
62,Dan Becker,"Not sure what's happened with her. and I think people also cannot upvote these so couple of quirks today. You see any others here, Emil, that you think we should pull in.",00:49:11.953,00:49:24.443,False
63,Hamel Husain,"no, not necessarily, I think, like probably going to the discord disk. Pretty good. Now.",00:49:26.933,00:49:32.413,False
64,Dan Becker,Yep.,00:49:32.593,00:49:33.323,False
65,Hamel Husain,"Tons of activity there as well. I mean, there's infinite number of questions. So we we can. I can't. I can't keep going. You like. okay, Laurie is asking, what's the best strategy when chunks. when the documents, when chunks or documents don't fit into the context window. do you do rag in a mapreduce style, summarize aggressively. What are the techniques you've seen work most effectively.",00:49:33.523,00:50:23.902,False
66,Ben,"So that's, I think, a very broad question, because it's like. why do they not fit? Is it because, like every documents 3 day long is because you need a lot of different documents etc, etc. So and how also another important aspect is, what's the latency tolerance? Because quite a lot of the time. You can make Rag infinitely better. But users won't stay waiting like 20 seconds for and answer. So you need to figure out like, how much time do I have? One way that you can often see what I've done in production actually is, retrieve the full documents, but have another database that maps every document with semi. So you would have like done your Lm. Summization at the previous step you would retrieve the relevant checks, and then you would pass the relevant surmise to the context window. But that kind of depends on like your actual setting. I have another call at 10, which is in 5 min for me. So if you've got like another final question. this is it.",00:50:25.623,00:51:35.932,False
67,Hamel Husain,Really enjoy this presentation.,00:51:36.423,00:51:38.093,False
68,Ben,Thank you.,00:51:39.453,00:51:40.179,False
69,Dan Becker,"Yeah, this is this is really great. covid. They just super clear and well presented. So thanks so much.",00:51:42.713,00:51:52.993,False
70,Ben,Thank you. Chat.,00:51:53.753,00:51:55.033,False
71,Hamel Husain,Thank you.,00:51:56.333,00:51:56.913,False
72,Ben,Bye.,00:51:57.458,00:51:57.763,False
73,Dan Becker,Thanks. Everyone.,00:51:59.293,00:52:00.033,False
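The two-stage setup Ben recommends in the transcript above, BM25 for first-pass retrieval of 50 to 100 candidates followed by a cross-encoder reranker, can be prototyped in a few lines. A minimal sketch, assuming the rank_bm25 package and a sentence-transformers cross-encoder; the corpus, query, and model name are illustrative and not part of this repository:

# Sketch: BM25 first-pass retrieval, then cross-encoder reranking.
# Assumes: pip install rank_bm25 sentence-transformers
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

corpus = [
    "BM25 is a keyword-based ranking function used by search engines.",
    "ColBERT represents documents as bags of token-level vectors.",
    "Cross-encoders score a (query, document) pair jointly and make strong rerankers.",
]

# First pass: BM25 over whitespace-tokenized documents
bm25 = BM25Okapi([doc.lower().split() for doc in corpus])
query = "how do rerankers work?"
candidates = bm25.get_top_n(query.lower().split(), corpus, n=3)  # in practice 50-100

# Second pass: rerank the candidates with a cross-encoder
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # illustrative model choice
scores = reranker.predict([(query, doc) for doc in candidates])
reranked = [doc for _, doc in sorted(zip(scores, candidates), reverse=True)]
print(reranked[0])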
mvp.py
ADDED
@@ -0,0 +1,109 @@
import sqlite3
from sentence_transformers import SentenceTransformer
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from lancedb.rerankers import ColbertReranker
from send_openai import get_coherent_answer

# Initialize the embedding model (loaded directly here for convenience; the
# LanceDB registry model below is what actually embeds documents and queries)
model_name = "BAAI/bge-small-en-v1.5"
embedding_model = SentenceTransformer(model_name)

# Earlier schema, kept for reference:
# class Document(LanceModel):
#     text: str = model.SourceField()
#     vector: Vector(384) = model.VectorField()
#     category: str

# Initialise the embedding function via the LanceDB model registry
model_registry = get_registry().get("sentence-transformers")
model = model_registry.create(name="BAAI/bge-small-en-v1.5")

# Schema for stored chunks: text is embedded into a 384-dim vector;
# filename and username are kept as plain attributes for filtering/display
class Document(LanceModel):
    text: str = model.SourceField()
    vector: Vector(384) = model.VectorField()
    filename: str
    username: str


# Connect to LanceDB and create (or open) the table
db = lancedb.connect(".my_db")
tbl = db.create_table("my_table", schema=Document, exist_ok=True)

# Function to read chunks from the SQLite database
def read_chunks_from_db(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT talkname, filename, chunk, username FROM text_chunks")
    chunks = cursor.fetchall()
    conn.close()
    return chunks

# # One-off ingestion: read chunks from the database
# db_path = 'rag.db'  # Replace with your actual database path
# chunks = read_chunks_from_db(db_path)

# # Prepare documents for embedding and storing in LanceDB
# docs = []
# for talkname, filename, chunk, username in chunks:
#     docs.append({
#         "text": chunk,
#         "filename": talkname,
#         "username": username
#     })

# # Add documents to the LanceDB table
# tbl.add(docs)
# # Generate the full-text (tf-idf) search index
# tbl.create_fts_index("text")

# Initialise a reranker
reranker = ColbertReranker()

# Function to log the interaction to the SQLite database
def log_interaction(question, chunks, coherent_answer, feedback=None, additional_feedback=None, db_path='rag.db'):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        INSERT INTO interactions (question, chunks, coherent_answer, feedback, additional_feedback)
        VALUES (?, ?, ?, ?, ?)
    ''', (question, "\n\n".join(chunks), coherent_answer, feedback, additional_feedback))
    conn.commit()
    conn.close()


# Run a hybrid (keyword + vector) search over the LanceDB table, rerank, and generate an answer
def search_question(question):
    query = question
    results = (tbl.search(query, query_type="hybrid")  # Hybrid means text + vector
               .limit(10)                              # Get 10 results from first-pass retrieval
               .rerank(reranker=reranker))

    if results:
        documents = results.to_pydantic(Document)
        context_chunks = [doc.text for doc in documents]
        result = get_coherent_answer(question, context_chunks)
        log_interaction(question, context_chunks, result)
        return result
    else:
        return "No answer found."
#     if results:
#         documents = results.to_pydantic(Document)
#         return [[doc.text, doc.filename, doc.username] for doc in documents]
#     else:
#         return []

# # Example query
# query = "What is Chihiro's new name given to her by the witch?"

# # Perform the search and rerank the results
# results = (tbl.search(query, query_type="hybrid")  # Hybrid means text + vector
#            .limit(10)                              # Get 10 results from first-pass retrieval
#            .rerank(reranker=reranker))

# # Print the results
# print(results)
# print(results.to_pydantic(Document))
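log_interaction above writes to an interactions table that no file in this commit creates. A minimal bootstrap sketch, with the schema inferred from the INSERT columns; the column types are assumptions:

import sqlite3

# One-off setup for the answer/feedback log used by log_interaction in mvp.py.
# Schema inferred from the INSERT statement; types are assumptions.
conn = sqlite3.connect('rag.db')
conn.execute('''
    CREATE TABLE IF NOT EXISTS interactions (
        id INTEGER PRIMARY KEY,
        question TEXT,
        chunks TEXT,
        coherent_answer TEXT,
        feedback TEXT,
        additional_feedback TEXT
    )
''')
conn.commit()
conn.close()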
requirements.txt
ADDED
@@ -0,0 +1,176 @@
aiohttp==3.9.5
aiosignal==1.3.1
aiostream==0.5.2
altgraph @ file:///home/conda/feedstock_root/build_artifacts/altgraph_1632761154050/work
annotated-types==0.6.0
anyio==4.3.0
appnope @ file:///Users/runner/miniforge3/conda-bld/appnope_1610094662606/work
argon2-cffi @ file:///Users/runner/miniforge3/conda-bld/argon2-cffi_1625821429979/work
async-generator==1.10
async-timeout==4.0.3
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1620387926260/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
beautifulsoup4==4.10.0
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1629908509068/work
boto3 @ file:///home/conda/feedstock_root/build_artifacts/boto3_1639467547608/work
botocore @ file:///home/conda/feedstock_root/build_artifacts/botocore_1639449679425/work
brotlipy==0.7.0
cachetools==5.0.0
certifi==2022.9.24
cffi @ file:///Users/runner/miniforge3/conda-bld/cffi_1666184005339/work
chardet @ file:///Users/runner/miniforge3/conda-bld/chardet_1610093454858/work
click==8.1.7
conda==22.9.0
conda-package-handling @ file:///Users/runner/miniforge3/conda-bld/conda-package-handling_1618231726924/work
cryptography @ file:///Users/runner/miniforge3/conda-bld/cryptography_1616851494586/work
cycler==0.10.0
debugpy @ file:///Users/runner/miniforge3/conda-bld/debugpy_1627074942557/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1631346842025/work
defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
distlib==0.3.6
distro==1.9.0
dnspython==2.6.1
email-validator==2.1.1
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1605121927639/work/dist/entrypoints-0.3-py2.py3-none-any.whl
exceptiongroup==1.2.0
fastapi==0.111.0
fastapi-cli==0.0.4
feedparser==6.0.10
filelock==3.8.0
Flask==2.0.2
frozenlist==1.4.1
fsspec==2021.11.0
google-auth==2.6.2
google-auth-oauthlib==0.5.1
grpclib==0.4.7
h11==0.14.0
h2==4.1.0
hpack==4.0.0
html2text==2020.1.16
httpcore==1.0.4
httptools==0.6.1
httpx==0.27.0
hyperframe==6.0.1
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1593328102638/work
importlib-metadata @ file:///Users/runner/miniforge3/conda-bld/importlib-metadata_1630267518765/work
iniconfig==2.0.0
ipykernel @ file:///Users/runner/miniforge3/conda-bld/ipykernel_1631291133690/work/dist/ipykernel-6.4.1-py3-none-any.whl
ipython @ file:///Users/runner/miniforge3/conda-bld/ipython_1630172565945/work
ipython-genutils==0.2.0
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1631590360471/work
itsdangerous==2.0.1
jedi @ file:///Users/runner/miniforge3/conda-bld/jedi_1610146802570/work
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1621419064915/work
jmespath @ file:///home/conda/feedstock_root/build_artifacts/jmespath_1589369830981/work
jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1614815863336/work
jupyter @ file:///Users/runner/miniforge3/conda-bld/jupyter_1611871906259/work
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1631893672246/work
jupyter-console @ file:///home/conda/feedstock_root/build_artifacts/jupyter_console_1616560109969/work
jupyter-core @ file:///Users/runner/miniforge3/conda-bld/jupyter_core_1631852739280/work
jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1631590465624/work
kiwisolver @ file:///Users/runner/miniforge3/conda-bld/kiwisolver_1630173773931/work
lxml==4.7.1
macholib @ file:///home/conda/feedstock_root/build_artifacts/macholib_1595268426755/work
markdown-it-py==3.0.0
MarkupSafe @ file:///Users/runner/miniforge3/conda-bld/markupsafe_1621455750847/work
matplotlib @ file:///Users/runner/miniforge3/conda-bld/matplotlib-suite_1632416631285/work
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1631080358261/work
mdurl==0.1.2
mistune @ file:///Users/runner/miniforge3/conda-bld/mistune_1624941595202/work
modal==0.62.201
multidict==6.0.5
nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1629120697898/work
nbconvert @ file:///Users/runner/miniforge3/conda-bld/nbconvert_1631672256401/work
nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1617383142101/work
nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1617163391303/work
notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1631733685426/work
numpy @ file:///Users/runner/miniforge3/conda-bld/numpy_1624399908429/work
oauthlib==3.1.1
olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work
openai==1.12.0
orjson==3.10.3
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1625323647219/work
pandas==1.3.0
pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1617148930513/work
patsy @ file:///home/conda/feedstock_root/build_artifacts/patsy_1632667180946/work
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1602535608087/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
Pillow @ file:///Users/runner/miniforge3/conda-bld/pillow_1630696664467/work
platformdirs==2.5.2
pluggy==1.5.0
prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1622586138406/work
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1629903925368/work
protobuf==4.25.3
psycopg2 @ file:///Users/runner/miniforge3/conda-bld/psycopg2-split_1667025658980/work
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycosat @ file:///Users/runner/miniforge3/conda-bld/pycosat_1610094841871/work
pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work
pycurl==7.44.1
pydantic==2.6.2
pydantic-core==2.16.3
pygments==2.18.0
pyinstaller @ file:///Users/runner/miniforge3/conda-bld/pyinstaller_1635565314309/work
pyinstaller-hooks-contrib @ file:///home/conda/feedstock_root/build_artifacts/pyinstaller-hooks-contrib_1629940154869/work
pyOpenSSL @ file:///home/conda/feedstock_root/build_artifacts/pyopenssl_1608055815057/work
pyparsing==2.4.7
pyrsistent @ file:///Users/runner/miniforge3/conda-bld/pyrsistent_1624984668772/work
PySocks @ file:///Users/runner/miniforge3/conda-bld/pysocks_1610291460269/work
pytest==8.2.2
python-dateutil==2.8.1
python-docx==1.1.2
python-dotenv==1.0.1
python-multipart==0.0.9
pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1612179539967/work
PyYAML==6.0.1
pyzmq @ file:///Users/runner/miniforge3/conda-bld/pyzmq_1631793572519/work
replicate==0.26.0
requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1608156231189/work
requests-oauthlib==1.3.0
rich==13.7.1
rsa==4.8
ruamel-yaml-conda @ file:///Users/runner/miniforge3/conda-bld/ruamel_yaml_1611943386799/work
s3transfer @ file:///home/conda/feedstock_root/build_artifacts/s3transfer_1626384238958/work
scipy @ file:///Users/runner/miniforge3/conda-bld/scipy_1633644994244/work
seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-split_1629095986539/work
Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
sgmllib3k==1.0.0
shellingham==1.5.4
sigtools==4.0.1
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
sniffio==1.3.0
soupsieve==2.3.1
starlette==0.37.2
statsmodels @ file:///Users/runner/miniforge3/conda-bld/statsmodels_1633292317174/work
synchronicity==0.6.7
terminado @ file:///Users/runner/miniforge3/conda-bld/terminado_1631128239869/work
testpath @ file:///home/conda/feedstock_root/build_artifacts/testpath_1621261527237/work
toml==0.10.2
tomli==2.0.1
toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1657485559105/work
tornado @ file:///Users/runner/miniforge3/conda-bld/tornado_1625488956802/work
tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1621890532941/work
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1630423529112/work
tweepy==4.0.0
typer==0.12.3
types-certifi==2021.10.8.3
types-toml==0.10.8.20240310
typing-extensions==4.9.0
ujson==5.10.0
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1615828766818/work
uvicorn==0.30.1
uvloop==0.19.0
virtualenv==20.16.5
watchfiles==0.22.0
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
webencodings==0.5.1
websockets==12.0
Werkzeug==2.0.2
widgetsnbextension @ file:///Users/runner/miniforge3/conda-bld/widgetsnbextension_1605475516388/work
wptools==0.4.17
yarl==1.9.4
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1625284368454/work
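The pinned list above does not include several packages the committed code imports: gradio (app.py), lancedb and sentence-transformers (mvp.py), and nltk (vtt_process.py). A hedged addition, left unpinned because the intended versions are not recorded in this commit:

# assumed additions, not part of the original pin set; pin versions as appropriate
gradio
lancedb
sentence-transformers
nltk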
send_openai.py
ADDED
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-

import json

from openai import OpenAI

# The OpenAI API key is read from the OPENAI_API_KEY environment variable
client = OpenAI()

# Function to generate a coherent answer from the retrieved text chunks
def get_coherent_answer(question, documents):
    prompt = f"""
    You are Jeremy Howard, a tech educator with a top-to-bottom explaining style.
    A user has asked a question, and some retrieved text chunks are provided to you.
    Based on the question and the text chunks, write your answer. Use lists or other plain-text formatting to explain.
    The question is: {question}
    The text chunks are: {documents}
    """

    print(f"prompt: {prompt}")
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    print(response.choices[0].message.content)
    return response.choices[0].message.content.strip()

# # Earlier anonymization helper, kept for reference:
# # Read the text file
# with open('anonymized_texts.txt', 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Process each line and save the results
# result = None
# with open('output.jsonl', 'w', encoding='utf-8') as jsonl_file:
#     for line in lines:
#         if line.strip():  # Ignore empty lines
#             try:
#                 anonymized_result = anonymize_content(line.strip())
#                 result = json.loads(anonymized_result)
#                 jsonl_file.write(json.dumps(result, ensure_ascii=False) + '\n')
#             except Exception as e:
#                 print(f"Error processing line: {line.strip()}")
#                 print(f"Result : {result}")
#                 print(f"Error: {e}")

# print("Anonymization complete. Results saved to output.jsonl")
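A quick way to exercise get_coherent_answer on its own, mirroring how search_question in mvp.py calls it; the question and chunk below are made-up placeholders, and OPENAI_API_KEY must be set:

# Standalone smoke test for the answer-generation step
from send_openai import get_coherent_answer

question = "What reranker does the demo use?"  # placeholder question
chunks = ["The MVP reranks hybrid search results with LanceDB's ColbertReranker."]  # placeholder chunk
print(get_coherent_answer(question, chunks))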
vtt_process.py
ADDED
@@ -0,0 +1,112 @@
import os
import re
import sqlite3

import pandas as pd  # kept from the original; not used below
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' tokenizer data

# Function to chunk text into a specified size with overlap and keep track of timestamps
def chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size=256, overlap=0.5):
    words = word_tokenize(text)
    chunks = []
    step = int(chunk_size * (1 - overlap))
    # Note: utterances shorter than chunk_size words produce no chunks at all
    num_chunks = (len(words) - chunk_size + step) // step

    for i in range(0, num_chunks * step, step):
        chunk = words[i:i + chunk_size]
        if len(chunk) < chunk_size:
            break
        chunk_start_ts = start_ts
        chunk_end_ts = end_ts  # Placeholder; a real value would need per-word timings
        chunks.append((' '.join(chunk), chunk_start_ts, chunk_end_ts))

    return chunks

# Function to read the list of VTT files from the database
def read_vtt_files_from_db(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT folder_name, file_name FROM vtt_files")
    vtt_files = cursor.fetchall()
    conn.close()
    return vtt_files


# Function to process a VTT file and extract chunks with timestamps
def process_vtt_file(file_path, chunk_size=256, overlap=0.5):
    with open(file_path, 'r') as file:
        vtt_data = file.read()

    # Regular expression to match the VTT cue format: "<id>\n<start> --> <end>\n<speaker>: <text>"
    pattern = r"(\d+)\n(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?): (.*?)\n"
    matches = re.findall(pattern, vtt_data, re.DOTALL)

    # Merge consecutive cues from the same speaker, then create chunks with timestamps
    data = []
    current_user = None
    current_text = ""
    start_ts = None
    previous_end = None

    for match in matches:
        cue_id, start, end, user, text = match
        if user != current_user:
            if current_user is not None:
                data.append((current_user, current_text, start_ts, previous_end))
            current_user = user
            current_text = text
            start_ts = start
        else:
            current_text += " " + text
        previous_end = end

    if current_user is not None:
        data.append((current_user, current_text, start_ts, previous_end))

    chunks = []
    chunk_id = 1
    for user, text, start_ts, end_ts in data:
        text_chunks = chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size, overlap)
        for chunk, chunk_start_ts, chunk_end_ts in text_chunks:
            chunks.append((chunk_id, chunk, chunk_start_ts, chunk_end_ts, user))
            chunk_id += 1

    return chunks

# Function to save chunks to the database
def save_chunks_to_db(db_path, folder_name, file_name, chunks):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS text_chunks (
            id INTEGER PRIMARY KEY,
            talkname TEXT,
            filename TEXT,
            chunkid INTEGER,
            chunk TEXT,
            start_ts TEXT,
            end_ts TEXT,
            username TEXT
        )
    ''')

    for chunk_id, chunk, chunk_start_ts, chunk_end_ts, user in chunks:
        cursor.execute('''
            INSERT INTO text_chunks (talkname, filename, chunkid, chunk, start_ts, end_ts, username)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', (folder_name, file_name, chunk_id, chunk, chunk_start_ts, chunk_end_ts, user))

    conn.commit()
    conn.close()

# Main script to process all VTT files and save chunks
root_dir = '/Users/t0mkaka/Desktop/Network/vtt_files'  # Replace with the path to your root folder
db_path = 'rag.db'  # Replace with the path to your SQLite database

vtt_files = read_vtt_files_from_db(db_path)

for folder_name, file_name in vtt_files:
    file_path = os.path.join(root_dir, file_name)
    if os.path.exists(file_path):
        chunks = process_vtt_file(file_path)
        save_chunks_to_db(db_path, folder_name, file_name, chunks)

print("Processed and saved all text chunks with timestamps and usernames to the database.")
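Two setup steps are assumed but not shown in this commit: word_tokenize needs the NLTK punkt data downloaded once, and read_vtt_files_from_db expects a vtt_files table (presumably populated by find_vtt.py). A minimal bootstrap sketch; the table schema is inferred from the SELECT above and the column types are assumptions:

import sqlite3
import nltk

# One-time download of the tokenizer data used by word_tokenize
nltk.download('punkt')

# Minimal vtt_files table so read_vtt_files_from_db has something to read;
# schema inferred from the SELECT in vtt_process.py, column types assumed
conn = sqlite3.connect('rag.db')
conn.execute('''
    CREATE TABLE IF NOT EXISTS vtt_files (
        folder_name TEXT,
        file_name TEXT
    )
''')
conn.commit()
conn.close()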