# upload_data.py
import os
import json
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from supabase.client import Client, create_client

# Keys every record in metadata.jsonl must provide to build a document.
_REQUIRED_FIELDS = ("Question", "Final answer", "task_id")


def _load_records(jsonl_path):
    """Parse a JSON Lines file into a list, one decoded value per non-blank line.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if a line is not valid JSON or the file is not valid UTF-8.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        # Blank lines (typically a trailing newline at the end of the file)
        # are not valid JSON; skip them instead of failing the whole load.
        return [json.loads(line) for line in jsonl_file if line.strip()]


def _missing_fields(sample):
    """Return the required field names that `sample` does not provide."""
    if not isinstance(sample, dict):
        return list(_REQUIRED_FIELDS)
    return [field for field in _REQUIRED_FIELDS if field not in sample]


def _build_document(sample, embeddings):
    """Build one row for upload: content text, source metadata, and embedding."""
    content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    return {
        "content": content,
        # A dict (JSON object), not a pre-serialized string.
        "metadata": {"source": sample['task_id']},
        "embedding": embeddings.embed_query(content),
    }


def upload_data_to_supabase():
    """
    Reads data from metadata.jsonl, generates embeddings, and uploads
    it to a Supabase table named 'documents'.

    Configuration comes from the SUPABASE_URL and SUPABASE_SERVICE_KEY
    environment variables (a .env file is loaded first). Every failure is
    reported with a printed message and the function returns early; nothing
    is uploaded unless all records were loaded, validated and embedded.

    Returns:
        None
    """
    # --- 1. Load Environment and Configuration ---
    print("Loading configuration...")
    load_dotenv()

    supabase_url = os.environ.get("SUPABASE_URL")
    supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

    if not supabase_url or not supabase_key:
        print("Error: SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in your .env file.")
        return

    # --- 2. Load the Local Data ---
    print("Loading data from metadata.jsonl...")
    try:
        json_QA = _load_records('metadata.jsonl')
    except FileNotFoundError:
        print("Error: metadata.jsonl not found. Make sure it is in the same directory.")
        return
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Error reading metadata.jsonl: {e}")
        return
    print(f"Successfully loaded {len(json_QA)} records from metadata.jsonl.")

    if not json_QA:
        # Avoid loading the embedding model and sending an empty insert.
        print("No records found in metadata.jsonl; nothing to upload.")
        return

    # Validate up front so a malformed record is reported before any
    # embedding work is done.
    for position, sample in enumerate(json_QA, start=1):
        missing = _missing_fields(sample)
        if missing:
            print(f"Error: record {position} in metadata.jsonl is missing required field(s): {', '.join(missing)}.")
            return

    # --- 3. Initialize Supabase Client and Embeddings Model ---
    print("Connecting to Supabase and initializing embeddings model...")
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        print("Connection and initialization successful.")
    except Exception as e:
        # Broad on purpose: this is the script's top-level boundary around
        # third-party initialization code.
        print(f"Error during initialization: {e}")
        return

    # --- 4. Prepare Documents for Upload ---
    print("Preparing documents and generating embeddings. This may take a few minutes...")
    docs_to_upload = []
    total = len(json_QA)
    for processed, sample in enumerate(json_QA, start=1):
        docs_to_upload.append(_build_document(sample, embeddings))

        # Optional: Print progress
        if processed % 10 == 0:
            print(f"Processed {processed}/{total} documents...")

    print("All documents have been processed.")

    # --- 5. Upload to Supabase ---
    print("Uploading documents to Supabase...")
    try:
        # The result is not inspected: the original author notes that the
        # Supabase V2 response has no simple count, and that an execution
        # without errors means the data is there.
        supabase.table("documents").insert(docs_to_upload).execute()
        print("\n--- Success! ---")
        print(f"Successfully uploaded {len(docs_to_upload)} documents to your Supabase table.")
    except Exception as e:
        print("\n--- Error during upload ---")
        print(f"An error occurred while uploading to Supabase: {e}")
        print("Please check your Supabase table schema and permissions.")


if __name__ == "__main__":
    upload_data_to_supabase()