File size: 6,649 Bytes
cfb87a4
1a8750d
 
9014f4d
1a8750d
cfb87a4
1a8750d
 
 
cfb87a4
1a8750d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfb87a4
 
 
 
 
 
 
 
 
1a8750d
 
 
 
 
 
 
 
 
9014f4d
1a8750d
cfb87a4
 
 
 
 
 
 
 
 
1a8750d
 
 
 
 
 
 
 
3c86168
 
9014f4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edd3ed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9014f4d
edd3ed5
9014f4d
edd3ed5
 
9014f4d
 
edd3ed5
 
9014f4d
edd3ed5
 
9014f4d
 
 
 
edd3ed5
9014f4d
 
 
edd3ed5
9014f4d
 
 
edd3ed5
9014f4d
 
 
 
 
edd3ed5
9014f4d
 
edd3ed5
9014f4d
cfb87a4
9014f4d
cfb87a4
 
 
 
 
 
 
 
 
 
 
 
 
edd3ed5
 
9014f4d
edd3ed5
 
 
5fa76e3
 
cfb87a4
edd3ed5
 
 
 
 
 
 
 
 
461d4fe
edd3ed5
 
 
9014f4d
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import pickle  
import pandas as pd
from utils.embedding_generation import compute_doc_embeddings

def load_database(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file and return it as a DataFrame indexed by its 'service' column.

    Loading is best-effort: any failure is reported on stdout and an empty
    DataFrame is returned instead of an exception being raised.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The table indexed by 'service', or an empty DataFrame
        when the file is missing or could not be loaded.
    """
    try:
        # Parsing and re-indexing both stay inside the try so that a failure
        # in either step (e.g. no 'service' column) is reported the same way.
        return pd.read_csv(file_path).set_index("service")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error loading CSV file '{file_path}': {err}")
    # Only reached after one of the handlers above has reported a failure.
    return pd.DataFrame()
        
def load_pickle(file_path: str):
    """
    Deserialize and return the object stored in a Pickle (.pkl) file.

    Failures are reported on stdout rather than raised.

    Args:
        file_path (str): Location of the Pickle file.

    Returns:
        object: The unpickled data, or None when the file is missing or
        could not be read/unpickled.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # files that come from a trusted source.
    loaded = None
    try:
        with open(file_path, "rb") as handle:  # pickle streams are binary
            loaded = pickle.load(handle)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error reading Pickle file '{file_path}': {err}")
    return loaded
        
def load_file(file_path: str) -> str:
    """
    Return the entire content of a UTF-8 text file.

    Reading is best-effort: problems are reported on stdout and an empty
    string is returned instead of an exception being raised.

    Args:
        file_path (str): Location of the text file.

    Returns:
        str: The file's text, or "" when the file is missing or unreadable.
    """
    contents = ""
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error reading file '{file_path}': {err}")
    return contents

# def load_timestamp(file_path: str) -> float:
#     """
#     Loads the timestamp from a file.

#     Args:
#         file_path (str): The file path from which the timestamp will be read.

#     Returns:
#         float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
#     """
#     timestamp_str = load_file(file_path)  # Use load_file function to read the file content
#     try:
#         return float(timestamp_str)  # Convert the string to a float
#     except ValueError:
#         print(f"Error: The content in '{file_path}' is not a valid float.")
#         return 0.0  # Return a default value if the content is not valid

# def save_timestamp(timestamp: float, file_path: str):
#     """
#     Saves the timestamp to a file to persist across sessions.

#     Args:
#         timestamp (float): The timestamp representing the last update time of the database.
#         file_path (str): The file path where the timestamp will be stored.

#     Returns:
#         None
#     """
#     try:
#         with open(file_path, 'w') as f:
#             f.write(str(timestamp))  # Convert timestamp to string before saving
#     except Exception as e:
#         print(f"Error saving timestamp: {e}")

# def save_pickle(embeddings: dict, file_path: str) -> None:
#     """
#     Saves to a pickle file safely.

#     Args:
#         embeddings (dict): The embeddings to be saved.
#         file_path (str): The file path where the embeddings will be saved.

#     Returns:
#         None
#     """
#     try:
#         with open(file_path, "wb") as file:
#             pickle.dump(embeddings, file)
#     except Exception as e:
#         print(f"Error saving embeddings to '{file_path}': {e}")

def update_embeddings(database: pd.DataFrame, embeddings_filepath: str):
    """
    Compute fresh embeddings for the given database.

    Args:
        database (pd.DataFrame): The database to embed.
        embeddings_filepath (str): Intended destination of the pickled
            embeddings. Currently unused: nothing is written to disk here.

    Returns:
        database_embeddings: Whatever compute_doc_embeddings returns for the
        database.
    """
    # NOTE: persisting the result to `embeddings_filepath` is disabled for
    # now, so this is a thin wrapper around the embedding computation.
    return compute_doc_embeddings(database)
            
def load_embeddings(database, database_filepath, embeddings_filepath):
    """
    Return embeddings for the given database, regenerating them on every call.

    Progress messages are printed before and after the regeneration.

    Args:
        database (pd.DataFrame): The database for which embeddings are needed.
        database_filepath (str): The file path of the database (CSV file or
            similar). Not consulted while the cache described below is disabled.
        embeddings_filepath (str): The file path for the embeddings pickle;
            forwarded to update_embeddings.

    Returns:
        database_embeddings: The newly generated embeddings for the database.
    """
    # NOTE: a timestamp-based cache is currently disabled here. It compared
    # os.path.getmtime(database_filepath) with a timestamp stored in
    # '/home/user/app/data/db_update_timestamp.txt': when equal, the pickled
    # embeddings at `embeddings_filepath` were loaded; otherwise embeddings
    # were regenerated and the stored timestamp was refreshed. Until it is
    # re-enabled, embeddings are always recomputed.
    print("Embeddings updating.....")
    fresh_embeddings = update_embeddings(database, embeddings_filepath)
    print("Embeddings updated.")

    return fresh_embeddings