Spaces:
Sleeping
Sleeping
import os | |
import pickle | |
import pandas as pd | |
from utils.embedding_generation import compute_doc_embeddings | |
def load_database(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file and return it as a DataFrame indexed by its 'service' column.

    Any failure (missing file, unreadable CSV, absent 'service' column, ...)
    is reported on stdout and an empty DataFrame is returned instead of
    raising, so callers should check ``.empty`` on the result.

    Args:
        file_path (str): Location of the CSV file.

    Returns:
        pd.DataFrame: The table indexed by 'service', or an empty DataFrame
        when loading failed.
    """
    try:
        # Read and re-index in one step; both operations are guarded.
        indexed = pd.read_csv(file_path).set_index("service")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"Error loading CSV file '{file_path}': {e}")
    else:
        return indexed
    # Shared fallback for every error path above.
    return pd.DataFrame()
def load_pickle(file_path: str):
    """
    Deserialize and return the object stored in a Pickle (.pkl) file.

    Errors are reported on stdout rather than raised.

    NOTE: unpickling can execute arbitrary code, so this must only be used
    on files that come from a trusted source.

    Args:
        file_path (str): Location of the Pickle file.

    Returns:
        object: The unpickled data, or None if the file is missing or could
        not be read.
    """
    loaded = None  # stays None on every failure path
    try:
        # Pickle data is binary, hence "rb".
        with open(file_path, "rb") as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"Error reading Pickle file '{file_path}': {e}")
    return loaded
def load_file(file_path: str) -> str:
    """
    Return the full text of a UTF-8 encoded file without raising on failure.

    Args:
        file_path (str): Location of the text file.

    Returns:
        str: The file's content, or an empty string when the file is missing
        or cannot be read (the error is printed to stdout).
    """
    content = ""  # default returned on every failure path
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            content = handle.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
    return content
# def load_timestamp(file_path: str) -> float: | |
# """ | |
# Loads the timestamp from a file. | |
# Args: | |
# file_path (str): The file path from which the timestamp will be read. | |
# Returns: | |
# float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found. | |
# """ | |
# timestamp_str = load_file(file_path) # Use load_file function to read the file content | |
# try: | |
# return float(timestamp_str) # Convert the string to a float | |
# except ValueError: | |
# print(f"Error: The content in '{file_path}' is not a valid float.") | |
# return 0.0 # Return a default value if the content is not valid | |
# def save_timestamp(timestamp: float, file_path: str): | |
# """ | |
# Saves the timestamp to a file to persist across sessions. | |
# Args: | |
# timestamp (float): The timestamp representing the last update time of the database. | |
# file_path (str): The file path where the timestamp will be stored. | |
# Returns: | |
# None | |
# """ | |
# try: | |
# with open(file_path, 'w') as f: | |
# f.write(str(timestamp)) # Convert timestamp to string before saving | |
# except Exception as e: | |
# print(f"Error saving timestamp: {e}") | |
# def save_pickle(embeddings: dict, file_path: str) -> None: | |
# """ | |
# Saves to a pickle file safely. | |
# Args: | |
# embeddings (dict): The embeddings to be saved. | |
# file_path (str): The file path where the embeddings will be saved. | |
# Returns: | |
# None | |
# """ | |
# try: | |
# with open(file_path, "wb") as file: | |
# pickle.dump(embeddings, file) | |
# except Exception as e: | |
# print(f"Error saving embeddings to '{file_path}': {e}") | |
def update_embeddings(database: pd.DataFrame, embeddings_filepath: str):
    """
    Compute fresh embeddings for the given database and return them.

    Args:
        database (pd.DataFrame): The database to embed; passed unchanged to
            ``compute_doc_embeddings``.
        embeddings_filepath (str): Intended destination of the pickled
            embeddings. Currently unused: the save-to-pickle step is
            disabled, so nothing is written to disk.

    Returns:
        Whatever ``compute_doc_embeddings`` returns for ``database``.
    """
    # NOTE: persisting the result via save_pickle(...) is disabled for now;
    # the embeddings are only returned to the caller.
    return compute_doc_embeddings(database)
def load_embeddings(database, database_filepath, embeddings_filepath):
    """
    Return embeddings for the given database, regenerating them on every call.

    Progress messages are printed before and after the computation.

    Args:
        database (pd.DataFrame): The database to compute embeddings for.
        database_filepath (str): Path of the database file. Currently unused
            (it was only needed by the disabled timestamp check).
        embeddings_filepath (str): Path of the embeddings pickle file;
            forwarded to ``update_embeddings``.

    Returns:
        The embeddings returned by ``update_embeddings``.
    """
    # NOTE: the timestamp-based cache is disabled. It used to compare
    # os.path.getmtime(database_filepath) with a timestamp stored in
    # '/home/user/app/data/db_update_timestamp.txt', load the pickled
    # embeddings when they matched, and otherwise regenerate the embeddings
    # and store the new timestamp. Until it is restored, embeddings are
    # recomputed unconditionally.
    print("Embeddings updating.....")
    fresh_embeddings = update_embeddings(database, embeddings_filepath)
    print("Embeddings updated.")
    return fresh_embeddings