itsskofficial committed on
Commit
7018286
·
1 Parent(s): f0298af

added util files

Browse files
Files changed (2) hide show
  1. test.py +78 -0
  2. upload_data.py +89 -0
test.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # test.py
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_community.vectorstores import SupabaseVectorStore
6
+ from supabase.client import Client, create_client
7
+
8
def test_retrieval():
    """
    Run an interactive similarity-search session against the Supabase vector store.

    Reads SUPABASE_URL and SUPABASE_SERVICE_KEY from the environment (after
    loading a .env file), connects to Supabase, builds a SupabaseVectorStore
    over the 'documents' table, then repeatedly prompts for a question and
    prints up to 3 matches together with their relevance scores.

    The session ends when the user types 'exit' or 'quit', presses Ctrl+C at
    the prompt, or when standard input is closed.

    Returns:
        None. Configuration, connection and search problems are printed
        rather than raised; setup failures end the function early.
    """
    # Load environment variables from .env file
    load_dotenv()

    # --- 1. Connect to the Database ---
    print("Connecting to Supabase...")
    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        print("Successfully connected to Supabase.")
    except Exception as e:
        print(f"Error connecting to Supabase: {e}")
        return

    # --- 2. Initialize Embeddings and Vector Store ---
    print("Initializing embeddings model...")
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        vector_store = SupabaseVectorStore(
            client=supabase,
            embedding=embeddings,
            table_name="documents",
            query_name="match_documents",
        )
    except Exception as e:
        # Report setup failures the same way as the connection step above
        # instead of dying with a traceback.
        print(f"Error during initialization: {e}")
        return
    print("Vector store initialized.")

    # --- 3. Start the Interactive Test Loop ---
    print("\nEnter a question to test the similarity search.")
    print("Type 'exit' or 'quit' to stop the script.\n")

    while True:
        # Get user input. EOFError must end the loop: if it were treated like
        # an ordinary error, a closed stdin would make input() fail again on
        # every iteration and the loop would never terminate.
        try:
            query = input("Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting...")
            break

        if query.lower() in ('exit', 'quit'):
            print("Exiting...")
            break

        # Nothing (or only whitespace) was typed: prompt again.
        if not query:
            continue

        # --- 4. Perform the Similarity Search ---
        print("\nSearching for similar documents...")
        try:
            # We ask for the top 3 matches (k=3) to get more context
            similar_docs = vector_store.similarity_search_with_relevance_scores(query, k=3)
        except Exception as e:
            # A failed search should not end the session; report and re-prompt.
            print(f"An error occurred: {e}")
            continue

        # --- 5. Display the Results ---
        if not similar_docs:
            print("\n--- No similar documents found in the database. ---")
            print("This might mean your database is empty. Please run the data upload cell in test.ipynb.\n")
            continue

        print(f"\n--- Found {len(similar_docs)} similar document(s) ---")
        for i, (doc, score) in enumerate(similar_docs):
            print(f"\n--- Result {i+1} (Similarity Score: {score:.4f}) ---")
            print(doc.page_content)
        print("\n-------------------------------------\n")


# Only start the interactive session when executed as a script.
if __name__ == "__main__":
    test_retrieval()
upload_data.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # upload_data.py
2
+ import os
3
+ import json
4
+ from dotenv import load_dotenv
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from supabase.client import Client, create_client
7
+
8
def upload_data_to_supabase():
    """
    Read records from metadata.jsonl, embed them, and insert them into Supabase.

    Each non-blank line of metadata.jsonl is parsed as one JSON record. For
    every record a content string is built from its 'Question' and
    'Final answer' fields, an embedding is generated for that string, and a
    row with 'content', 'metadata' (the record's 'task_id' as 'source') and
    'embedding' is prepared. All rows are then inserted into the 'documents'
    table in a single request.

    SUPABASE_URL and SUPABASE_SERVICE_KEY are read from the environment after
    loading a .env file.

    Returns:
        None. Missing configuration, unreadable or malformed input, an empty
        dataset, and initialization or upload failures are printed and end
        the function early instead of raising.
    """
    # --- 1. Load Environment and Configuration ---
    print("Loading configuration...")
    load_dotenv()

    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    # --- 2. Load the Local Data ---
    print("Loading data from metadata.jsonl...")
    try:
        with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would otherwise reject them and abort the load.
            json_QA = [json.loads(line) for line in jsonl_file if line.strip()]
        print(f"Successfully loaded {len(json_QA)} records from metadata.jsonl.")
    except FileNotFoundError:
        print("Error: metadata.jsonl not found. Make sure it is in the same directory.")
        return
    except Exception as e:
        print(f"Error reading metadata.jsonl: {e}")
        return

    if not json_QA:
        # Avoid loading the embeddings model and sending an empty insert.
        print("No records found in metadata.jsonl. Nothing to upload.")
        return

    # --- 3. Initialize Supabase Client and Embeddings Model ---
    print("Connecting to Supabase and initializing embeddings model...")
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        print("Connection and initialization successful.")
    except Exception as e:
        print(f"Error during initialization: {e}")
        return

    # --- 4. Prepare Documents for Upload ---
    print("Preparing documents and generating embeddings. This may take a few minutes...")
    docs_to_upload = []
    for i, sample in enumerate(json_QA):
        try:
            # Create the main content string
            content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
            source = sample['task_id']
        except (KeyError, TypeError) as e:
            # The record number counts non-blank lines, starting at 1.
            print(f"Error: record {i + 1} in metadata.jsonl is malformed or missing a required field: {e}")
            return

        # Create the vector embedding for the content
        embedding = embeddings.embed_query(content)

        # Create the structured document for upload
        docs_to_upload.append({
            "content": content,
            "metadata": {"source": source},  # Sent as a JSON object, not a string
            "embedding": embedding,
        })

        # Optional: Print progress
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{len(json_QA)} documents...")

    print("All documents have been processed.")

    # --- 5. Upload to Supabase ---
    print("Uploading documents to Supabase...")
    try:
        # The returned response is not inspected; an execution that does not
        # raise is treated as a successful upload.
        supabase.table("documents").insert(docs_to_upload).execute()
        print("\n--- Success! ---")
        print(f"Successfully uploaded {len(docs_to_upload)} documents to your Supabase table.")
    except Exception as e:
        print("\n--- Error during upload ---")
        print(f"An error occurred while uploading to Supabase: {e}")
        print("Please check your Supabase table schema and permissions.")


# Only run the upload when executed as a script.
if __name__ == "__main__":
    upload_data_to_supabase()