Futuresony committed on
Commit
6d988fa
·
verified ·
1 Parent(s): 1d18882

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -263
app.py DELETED
@@ -1,263 +0,0 @@
1
- import psycopg2
2
- import os
3
- import pickle
4
- import traceback
5
- import numpy as np
6
- import json
7
- import base64
8
- import time
9
-
10
- # Assuming gspread and SentenceTransformer are installed
11
- try:
12
- import gspread
13
- from oauth2client.service_account import ServiceAccountCredentials
14
- from sentence_transformers import SentenceTransformer
15
- print("gspread and SentenceTransformer imported successfully.")
16
- except ImportError:
17
- print("Error: Required libraries (gspread, oauth2client, sentence_transformers) not found.")
18
- print("Please install them: pip install psycopg2-binary gspread oauth2client sentence-transformers numpy")
19
- # Exit or handle the error appropriately if libraries are missing
20
- exit() # Exiting for demonstration if imports fail
21
-
# --- Configuration ---
# PostgreSQL connection settings, read from the environment of the process
# running this script. DB_HOST, DB_NAME, DB_USER and DB_PASSWORD have no
# default and are None when unset; connect_db() refuses to connect then.
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_PORT = os.getenv("DB_PORT", "5432")  # Default PostgreSQL port

# Google Sheets authentication: base64-encoded service-account JSON.
GOOGLE_BASE64_CREDENTIALS = os.getenv("GOOGLE_BASE64_CREDENTIALS")

# Spreadsheet to migrate. An optional SHEET_ID environment variable overrides
# the built-in ID; when it is unset or empty the original ID is used, so
# existing deployments behave exactly as before.
SHEET_ID = os.getenv("SHEET_ID") or "19ipxC2vHYhpXCefpxpIkpeYdI43a1Ku2kYwecgUULIw"

# Table names
BUSINESS_DATA_TABLE = "business_data"
CONVERSATION_HISTORY_TABLE = "conversation_history"

# Embedding dimension; must match the Sentence Transformer model in use
# because it sizes the `vector(...)` column of the business data table.
EMBEDDING_DIM = 384  # Dimension for paraphrase-MiniLM-L6-v2
40
-
41
- # --- Database Functions ---
def connect_db():
    """Open a PostgreSQL connection using the module-level DB_* settings.

    Returns:
        The psycopg2 connection on success, or None when any of DB_HOST,
        DB_NAME, DB_USER or DB_PASSWORD is unset/empty or when the
        connection attempt raises.
    """
    print("Attempting to connect to the database...")

    required_settings = (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD)
    if any(not setting for setting in required_settings):
        print("Error: Database credentials (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD) are not fully set as environment variables.")
        return None

    connection_params = {
        "host": DB_HOST,
        "database": DB_NAME,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "port": DB_PORT,
    }
    try:
        connection = psycopg2.connect(**connection_params)
    except Exception as exc:
        # Any failure is reported and turned into a None return so the
        # caller can decide how to stop.
        print(f"Error connecting to the database: {exc}")
        print(traceback.format_exc())
        return None

    print("Database connection successful.")
    return connection
62
-
def setup_db_schema(conn):
    """Create the pgvector extension and both application tables if missing.

    Args:
        conn: Open database connection; its cursor is used to run the DDL.

    Returns:
        True when every statement ran and the transaction was committed,
        False when anything raised (the transaction is rolled back).
    """
    print("Setting up database schema...")

    business_data_ddl = f"""
        CREATE TABLE IF NOT EXISTS {BUSINESS_DATA_TABLE} (
            id SERIAL PRIMARY KEY,
            service TEXT NOT NULL,
            description TEXT NOT NULL,
            embedding vector({EMBEDDING_DIM}) -- Assuming EMBEDDING_DIM is defined globally
        );
    """
    conversation_history_ddl = f"""
        CREATE TABLE IF NOT EXISTS {CONVERSATION_HISTORY_TABLE} (
            id SERIAL PRIMARY KEY,
            timestamp TIMESTAMP WITH TIME ZONE NOT NULL,
            user_id TEXT,
            user_query TEXT,
            model_response TEXT,
            tool_details JSONB,
            model_used TEXT
        );
    """

    # Each step is (SQL to run, message printed once it has run). The
    # extension comes first because the business table uses the vector type.
    schema_steps = (
        ("CREATE EXTENSION IF NOT EXISTS vector;",
         "pgvector extension enabled (if not already)."),
        (business_data_ddl,
         f"Table '{BUSINESS_DATA_TABLE}' created (if not already)."),
        (conversation_history_ddl,
         f"Table '{CONVERSATION_HISTORY_TABLE}' created (if not already)."),
    )

    try:
        with conn.cursor() as cursor:
            for statement, done_message in schema_steps:
                cursor.execute(statement)
                print(done_message)

            conn.commit()
            print("Database schema setup complete.")
            return True
    except Exception as exc:
        print(f"Error setting up database schema: {exc}")
        print(traceback.format_exc())
        conn.rollback()
        return False
105
-
106
- # --- Google Sheets Authentication and Data Retrieval ---
def authenticate_google_sheets():
    """Build an authorized gspread client from GOOGLE_BASE64_CREDENTIALS.

    The environment value is base64-decoded, parsed as JSON and handed to
    ServiceAccountCredentials.from_json_keyfile_dict.

    Returns:
        The authorized gspread client, or None when the variable is not set
        or any step of decoding/authorization raises.
    """
    print("Authenticating Google Account for Sheets access...")

    if not GOOGLE_BASE64_CREDENTIALS:
        print("Error: GOOGLE_BASE64_CREDENTIALS environment variable not set. Google Sheets access will fail.")
        return None

    scopes = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
    try:
        decoded_key = base64.b64decode(GOOGLE_BASE64_CREDENTIALS)
        key_info = json.loads(decoded_key.decode('utf-8'))
        service_creds = ServiceAccountCredentials.from_json_keyfile_dict(key_info, scopes)
        sheets_client = gspread.authorize(service_creds)
    except Exception as exc:
        print(f"Google Sheets authentication failed: {exc}")
        print(traceback.format_exc())
        print("Please ensure your GOOGLE_BASE64_CREDENTIALS environment variable is correctly set and contains valid service account credentials.")
        return None

    print("Google Sheets authentication successful.")
    return sheets_client
127
-
128
- # --- Data Migration Function ---
def _clean_sheet_rows(records):
    """Return (service, description) string pairs for usable sheet rows.

    A row is kept only when both its 'Service' and 'Description' values are
    truthy and still non-empty after str() conversion and stripping. The
    str() call matters because gspread's get_all_records can return numeric
    cells as int/float, which have no .strip().
    """
    cleaned = []
    for row in records:
        raw_service = row.get('Service')
        raw_description = row.get('Description')
        if not raw_service or not raw_description:
            continue
        service = str(raw_service).strip()
        description = str(raw_description).strip()
        if service and description:
            cleaned.append((service, description))
    return cleaned


def migrate_google_sheet_data_to_db(conn, gc_client, embedder_model):
    """Copy the Google Sheet rows into the business data table with embeddings.

    The migration is skipped (and reported as successful) when the table
    already holds rows. Otherwise the first worksheet of SHEET_ID is read,
    rows lacking a service or description are dropped, one embedding per row
    is generated in batches, and all rows are inserted in a single
    transaction. Committing only once means a failure part-way through
    leaves the table empty, so the next run migrates again instead of
    mistaking a partial table for a finished migration.

    Args:
        conn: Open database connection.
        gc_client: Authorized gspread client, or None.
        embedder_model: Object whose ``encode(list_of_str,
            convert_to_tensor=False)`` returns an array-like with a
            ``tolist()`` method, or None.

    Returns:
        True when the table was already populated or the migration was
        committed; False when a prerequisite is missing, the sheet has no
        usable rows, or any step raised (the transaction is rolled back).
    """
    print("Migrating data from Google Sheet to database...")
    if gc_client is None or not SHEET_ID:
        print("Skipping Google Sheet migration: Google Sheets client or Sheet ID not available.")
        return False
    if embedder_model is None:
        print("Skipping Google Sheet migration: Embedder not available.")
        return False
    if EMBEDDING_DIM is None:
        print("Skipping Google Sheet migration: EMBEDDING_DIM not defined.")
        return False

    try:
        # Never migrate on top of existing data.
        with conn.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {BUSINESS_DATA_TABLE};")
            count = cur.fetchone()[0]
        if count > 0:
            print(f"Table '{BUSINESS_DATA_TABLE}' already contains {count} records. Skipping migration.")
            return True  # Indicate success because data is already there

        sheet = gc_client.open_by_key(SHEET_ID).sheet1
        print(f"Successfully opened Google Sheet with ID: {SHEET_ID}")
        data_records = sheet.get_all_records()

        if not data_records:
            print("No data records found in Google Sheet.")
            return False

        rows = _clean_sheet_rows(data_records)
        if not rows:
            print("Filtered data is empty after checking for 'Service' and 'Description'.")
            return False

        print(f"Processing {len(rows)} records for migration.")
        descriptions_for_embedding = [
            f"Service: {service}. Description: {description}"
            for service, description in rows
        ]

        # Encode in batches to bound memory use on large sheets.
        batch_size = 64
        # Ceiling division: an exact multiple of batch_size must not report
        # one batch too many.
        total_batches = (len(descriptions_for_embedding) + batch_size - 1) // batch_size
        embeddings_list = []
        batch_starts = range(0, len(descriptions_for_embedding), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch_descriptions = descriptions_for_embedding[start:start + batch_size]
            print(f"Encoding batch {batch_number} of {total_batches}...")
            batch_embeddings = embedder_model.encode(batch_descriptions, convert_to_tensor=False)
            embeddings_list.extend(batch_embeddings.tolist())  # plain lists of floats

        if len(embeddings_list) != len(rows):
            raise ValueError(
                f"Embedder returned {len(embeddings_list)} embeddings for {len(rows)} records."
            )

        # The embedding is passed as a Python list and cast with ::vector.
        # NOTE(review): relies on psycopg2 adapting the list to a SQL array
        # and pgvector casting arrays to vector - confirm for the installed
        # versions.
        insert_sql = f"""
            INSERT INTO {BUSINESS_DATA_TABLE} (service, description, embedding)
            VALUES (%s, %s, %s::vector);
        """
        insert_count = 0
        with conn.cursor() as cur:
            for (service, description), embedding in zip(rows, embeddings_list):
                cur.execute(insert_sql, (service, description, embedding))
                insert_count += 1
                if insert_count % 100 == 0:
                    print(f"Inserted {insert_count} records...")

        conn.commit()  # single commit keeps the migration all-or-nothing
        print(f"Migration complete. Inserted {insert_count} records into '{BUSINESS_DATA_TABLE}'.")
        return True

    except Exception as e:
        print(f"Error during Google Sheet data migration: {e}")
        print(traceback.format_exc())
        conn.rollback()
        return False
203
-
204
- # --- Main Migration Execution ---
if __name__ == "__main__":
    # Script entry point: authenticate, load the embedder, connect, create
    # the schema and migrate. Every failure path ends the process with a
    # non-zero status via SystemExit so shells and schedulers can detect it;
    # the site-provided exit() helper is avoided because it is meant for
    # interactive use and exits with status 0 when called without arguments.
    print("Starting RAG data migration script...")

    # 1. Authenticate Google Sheets
    gc = authenticate_google_sheets()
    if gc is None:
        print("Google Sheets authentication failed. Cannot migrate data from Sheets.")
        raise SystemExit(1)

    # 2. Initialize Embedder Model
    embedder = None  # stays None unless the model loads with the right dimension
    try:
        print(f"Loading Sentence Transformer model for embeddings (dimension: {EMBEDDING_DIM})...")
        loaded_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
        # The table column is sized by EMBEDDING_DIM, so a model with a
        # different output size must not be used.
        model_dim = loaded_model.get_sentence_embedding_dimension()
        if model_dim != EMBEDDING_DIM:
            print(f"Error: Loaded embedder dimension ({model_dim}) does not match expected EMBEDDING_DIM ({EMBEDDING_DIM}).")
            print("Please check the model or update EMBEDDING_DIM.")
        else:
            embedder = loaded_model
            print("Embedder model loaded successfully.")
    except Exception as e:
        print(f"Error loading Sentence Transformer model: {e}")
        print(traceback.format_exc())
        embedder = None

    if embedder is None:
        print("Embedder model not available. Cannot generate embeddings for migration.")
        raise SystemExit(1)

    # 3. Connect to Database
    db_conn = connect_db()
    if db_conn is None:
        print("Database connection failed. Cannot migrate data.")
        raise SystemExit(1)

    migration_ok = False
    try:
        # 4. Setup Database Schema (if not already done)
        if setup_db_schema(db_conn):
            # 5. Migrate Data
            migration_ok = bool(migrate_google_sheet_data_to_db(db_conn, gc, embedder))
            if migration_ok:
                print("\nRAG Data Migration to PostgreSQL completed successfully.")
            else:
                print("\nRAG Data Migration to PostgreSQL failed.")
        else:
            print("\nDatabase schema setup failed. Data migration skipped.")
    finally:
        # 6. Close Database Connection
        if db_conn is not None:
            db_conn.close()
            print("Database connection closed.")

    print("\nMigration script finished.")
    if not migration_ok:
        raise SystemExit(1)