Spaces:
Build error
Build error
Delete app.py
Browse files
app.py
DELETED
@@ -1,263 +0,0 @@
|
|
1 |
-
import psycopg2
|
2 |
-
import os
|
3 |
-
import pickle
|
4 |
-
import traceback
|
5 |
-
import numpy as np
|
6 |
-
import json
|
7 |
-
import base64
|
8 |
-
import time
|
9 |
-
|
10 |
-
# Assuming gspread and SentenceTransformer are installed
|
11 |
-
# Third-party dependencies are imported under a guard so that a missing
# package produces an actionable message instead of a bare traceback.
try:
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    from sentence_transformers import SentenceTransformer
    print("gspread and SentenceTransformer imported successfully.")
except ImportError as import_error:
    print("Error: Required libraries (gspread, oauth2client, sentence_transformers) not found.")
    print("Please install them: pip install psycopg2-binary gspread oauth2client sentence-transformers numpy")
    # Surface which import actually failed; the generic message above lists
    # all three packages regardless of which one is missing.
    print(f"Import failure detail: {import_error}")
    # Raise SystemExit directly rather than calling exit(): exit() is injected
    # by the `site` module and may be absent, and with no argument it reports
    # success (status 0) even though the script cannot run.
    raise SystemExit(1)
|
21 |
-
|
22 |
-
# PostgreSQL connection settings, read from the process environment.
# connect_db() refuses to connect unless host, name, user and password are set.
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_PORT = os.getenv("DB_PORT", "5432")  # Default PostgreSQL port

# Google Sheets settings.
# GOOGLE_BASE64_CREDENTIALS holds a base64-encoded service-account JSON key.
GOOGLE_BASE64_CREDENTIALS = os.getenv("GOOGLE_BASE64_CREDENTIALS")
# The sheet ID can be overridden with the SHEET_ID environment variable; when
# it is unset or empty the previously hard-coded ID is used, so existing
# deployments behave exactly as before.
SHEET_ID = os.getenv("SHEET_ID") or "19ipxC2vHYhpXCefpxpIkpeYdI43a1Ku2kYwecgUULIw"

# Table names used by the schema setup and migration functions.
BUSINESS_DATA_TABLE = "business_data"
CONVERSATION_HISTORY_TABLE = "conversation_history"

# Embedding dimension (must match the chosen Sentence Transformer model).
EMBEDDING_DIM = 384  # Dimension for paraphrase-MiniLM-L6-v2
|
40 |
-
|
41 |
-
# --- Database Functions ---
|
42 |
-
def connect_db():
    """Open a PostgreSQL connection using the module-level DB_* settings.

    Returns:
        The psycopg2 connection on success, or None when any of DB_HOST,
        DB_NAME, DB_USER or DB_PASSWORD is unset/empty or when the connection
        attempt raises.
    """
    print("Attempting to connect to the database...")

    # DB_PORT is not checked here because it always has a default value.
    required_settings = (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD)
    if any(not setting for setting in required_settings):
        print("Error: Database credentials (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD) are not fully set as environment variables.")
        return None

    connection_kwargs = {
        "host": DB_HOST,
        "database": DB_NAME,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "port": DB_PORT,
    }
    try:
        connection = psycopg2.connect(**connection_kwargs)
        print("Database connection successful.")
        return connection
    except Exception as error:
        # Any failure is reported and turned into None; callers test for None.
        print(f"Error connecting to the database: {error}")
        print(traceback.format_exc())
        return None
|
62 |
-
|
63 |
-
def setup_db_schema(conn):
    """Create the pgvector extension and both application tables if missing.

    Args:
        conn: An open database connection providing cursor(), commit() and
            rollback().

    Returns:
        True when every statement ran and the transaction was committed;
        False when any step raised (the transaction is rolled back).
    """
    print("Setting up database schema...")
    try:
        business_table_ddl = f"""
            CREATE TABLE IF NOT EXISTS {BUSINESS_DATA_TABLE} (
                id SERIAL PRIMARY KEY,
                service TEXT NOT NULL,
                description TEXT NOT NULL,
                embedding vector({EMBEDDING_DIM}) -- Assuming EMBEDDING_DIM is defined globally
            );
        """
        history_table_ddl = f"""
            CREATE TABLE IF NOT EXISTS {CONVERSATION_HISTORY_TABLE} (
                id SERIAL PRIMARY KEY,
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL,
                user_id TEXT,
                user_query TEXT,
                model_response TEXT,
                tool_details JSONB,
                model_used TEXT
            );
        """
        # Each step is (SQL to run, progress message printed afterwards).
        # The extension comes first because the business table uses the
        # vector column type.
        schema_steps = (
            (
                "CREATE EXTENSION IF NOT EXISTS vector;",
                "pgvector extension enabled (if not already).",
            ),
            (
                business_table_ddl,
                f"Table '{BUSINESS_DATA_TABLE}' created (if not already).",
            ),
            (
                history_table_ddl,
                f"Table '{CONVERSATION_HISTORY_TABLE}' created (if not already).",
            ),
        )

        with conn.cursor() as cursor:
            for statement, progress_message in schema_steps:
                cursor.execute(statement)
                print(progress_message)
            conn.commit()

        print("Database schema setup complete.")
        return True
    except Exception as error:
        print(f"Error setting up database schema: {error}")
        print(traceback.format_exc())
        conn.rollback()
        return False
|
105 |
-
|
106 |
-
# --- Google Sheets Authentication and Data Retrieval ---
|
107 |
-
def authenticate_google_sheets():
    """Build an authorized gspread client from GOOGLE_BASE64_CREDENTIALS.

    The environment value is base64-decoded, parsed as JSON and passed to
    ServiceAccountCredentials.from_json_keyfile_dict.

    Returns:
        The authorized gspread client, or None when the variable is unset or
        any decoding/authorization step raises.
    """
    print("Authenticating Google Account for Sheets access...")

    if not GOOGLE_BASE64_CREDENTIALS:
        print("Error: GOOGLE_BASE64_CREDENTIALS environment variable not set. Google Sheets access will fail.")
        return None

    try:
        decoded_key = base64.b64decode(GOOGLE_BASE64_CREDENTIALS)
        service_account_info = json.loads(decoded_key.decode('utf-8'))
        api_scopes = [
            'https://spreadsheets.google.com/feeds',
            'https://www.googleapis.com/auth/drive',
        ]
        sheets_client = gspread.authorize(
            ServiceAccountCredentials.from_json_keyfile_dict(
                service_account_info, api_scopes
            )
        )
        print("Google Sheets authentication successful.")
        return sheets_client
    except Exception as error:
        print(f"Google Sheets authentication failed: {error}")
        print(traceback.format_exc())
        print("Please ensure your GOOGLE_BASE64_CREDENTIALS environment variable is correctly set and contains valid service account credentials.")
        return None
|
127 |
-
|
128 |
-
# --- Data Migration Function ---
|
129 |
-
def _sheet_cell_text(value):
    """Return a sheet cell as stripped text; falsy cells become ''."""
    # Cells are not guaranteed to be strings (a numeric cell may arrive as
    # int/float), so convert before stripping instead of calling .strip()
    # on the raw value.
    return str(value).strip() if value else ""


def migrate_google_sheet_data_to_db(conn, gc_client, embedder_model):
    """Load rows from the Google Sheet, embed them and insert them into the DB.

    The migration is skipped (and reported as successful) when the business
    data table already contains rows. Otherwise every sheet record with a
    non-blank 'Service' and 'Description' is embedded and inserted.

    All inserts are committed once, at the end. Because a non-empty table
    makes later runs skip the migration, committing part-way would let a
    failed run leave a partial table that is never completed.

    Args:
        conn: Open database connection (cursor(), commit(), rollback()).
        gc_client: Authorized gspread client, or None.
        embedder_model: Object whose encode(texts, convert_to_tensor=False)
            returns a value exposing .tolist(), or None.

    Returns:
        True when data was inserted or was already present; False when a
        prerequisite is missing, the sheet yields no usable rows, or any
        step raises (the transaction is rolled back).
    """
    print("Migrating data from Google Sheet to database...")
    if gc_client is None or SHEET_ID is None:
        print("Skipping Google Sheet migration: Google Sheets client or Sheet ID not available.")
        return False
    if embedder_model is None:
        print("Skipping Google Sheet migration: Embedder not available.")
        return False
    if EMBEDDING_DIM is None:
        print("Skipping Google Sheet migration: EMBEDDING_DIM not defined.")
        return False

    try:
        # Skip the whole migration if the table is already populated.
        with conn.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {BUSINESS_DATA_TABLE};")
            count = cur.fetchone()[0]
        if count > 0:
            print(f"Table '{BUSINESS_DATA_TABLE}' already contains {count} records. Skipping migration.")
            return True  # Indicate success because data is already there

        sheet = gc_client.open_by_key(SHEET_ID).sheet1
        print(f"Successfully opened Google Sheet with ID: {SHEET_ID}")
        data_records = sheet.get_all_records()

        if not data_records:
            print("No data records found in Google Sheet.")
            return False

        # Normalise once and keep only rows where both fields are non-blank
        # after stripping, so whitespace-only cells are not inserted as ''.
        cleaned_rows = []
        for record in data_records:
            service = _sheet_cell_text(record.get('Service'))
            description = _sheet_cell_text(record.get('Description'))
            if service and description:
                cleaned_rows.append((service, description))

        if not cleaned_rows:
            print("Filtered data is empty after checking for 'Service' and 'Description'.")
            return False

        print(f"Processing {len(cleaned_rows)} records for migration.")
        descriptions_for_embedding = [
            f"Service: {service}. Description: {description}"
            for service, description in cleaned_rows
        ]

        # Encode in batches to bound memory use on large sheets.
        batch_size = 64
        # Ceiling division: an exact multiple of batch_size must not report
        # one batch too many.
        total_batches = (len(descriptions_for_embedding) + batch_size - 1) // batch_size
        embeddings_list = []
        batch_starts = range(0, len(descriptions_for_embedding), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch_descriptions = descriptions_for_embedding[start:start + batch_size]
            print(f"Encoding batch {batch_number} of {total_batches}...")
            batch_embeddings = embedder_model.encode(batch_descriptions, convert_to_tensor=False)
            embeddings_list.extend(batch_embeddings.tolist())  # plain Python lists

        insert_count = 0
        with conn.cursor() as cur:
            for (service, description), embedding in zip(cleaned_rows, embeddings_list):
                cur.execute(f"""
                    INSERT INTO {BUSINESS_DATA_TABLE} (service, description, embedding)
                    VALUES (%s, %s, %s::vector);
                """, (service, description, embedding))
                insert_count += 1
                if insert_count % 100 == 0:
                    # Progress only; the commit is deliberately deferred.
                    print(f"Inserted {insert_count} records...")

        conn.commit()  # Single commit keeps the migration all-or-nothing
        print(f"Migration complete. Inserted {insert_count} records into '{BUSINESS_DATA_TABLE}'.")
        return True

    except Exception as e:
        print(f"Error during Google Sheet data migration: {e}")
        print(traceback.format_exc())
        conn.rollback()
        return False
|
203 |
-
|
204 |
-
# --- Main Migration Execution ---
|
205 |
-
# Script entry point: authenticate, load the embedder, connect, create the
# schema, migrate, and always close the connection. Every failure path exits
# with a non-zero status so that callers (shells, CI, schedulers) can detect it.
if __name__ == "__main__":
    # Local import: sys.exit is used instead of the bare exit() helper, which
    # is only injected by the `site` module and, called without an argument,
    # reports success (status 0) even on failure.
    import sys

    print("Starting RAG data migration script...")

    # 1. Authenticate Google Sheets
    gc = authenticate_google_sheets()
    if gc is None:
        print("Google Sheets authentication failed. Cannot migrate data from Sheets.")
        sys.exit(1)

    # 2. Initialize Embedder Model
    try:
        print(f"Loading Sentence Transformer model for embeddings (dimension: {EMBEDDING_DIM})...")
        embedder = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
        # The table's vector column is created with EMBEDDING_DIM, so a model
        # with a different output size must not be used.
        loaded_dimension = embedder.get_sentence_embedding_dimension()
        if loaded_dimension != EMBEDDING_DIM:
            print(f"Error: Loaded embedder dimension ({loaded_dimension}) does not match expected EMBEDDING_DIM ({EMBEDDING_DIM}).")
            print("Please check the model or update EMBEDDING_DIM.")
            embedder = None  # Prevent migration with the wrong dimension
        else:
            print("Embedder model loaded successfully.")
    except Exception as e:
        print(f"Error loading Sentence Transformer model: {e}")
        print(traceback.format_exc())
        embedder = None  # Model loading failed

    if embedder is None:
        print("Embedder model not available. Cannot generate embeddings for migration.")
        sys.exit(1)

    # 3. Connect to Database
    db_conn = connect_db()
    if db_conn is None:
        print("Database connection failed. Cannot migrate data.")
        sys.exit(1)

    migration_succeeded = False
    try:
        # 4. Setup Database Schema (if not already done)
        if setup_db_schema(db_conn):
            # 5. Migrate Data
            if migrate_google_sheet_data_to_db(db_conn, gc, embedder):
                print("\nRAG Data Migration to PostgreSQL completed successfully.")
                migration_succeeded = True
            else:
                print("\nRAG Data Migration to PostgreSQL failed.")
        else:
            print("\nDatabase schema setup failed. Data migration skipped.")
    finally:
        # 6. Close Database Connection
        if db_conn:
            db_conn.close()
            print("Database connection closed.")

    print("\nMigration script finished.")
    if not migration_succeeded:
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|