Spaces:
Sleeping
Sleeping
Update utils/file_utils.py
Browse files- utils/file_utils.py +50 -48
utils/file_utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import os
|
2 |
import pickle
|
3 |
import pandas as pd
|
4 |
-
|
5 |
|
6 |
def load_database(file_path: str) -> pd.DataFrame:
|
7 |
"""
|
@@ -43,24 +43,7 @@ def load_pickle(file_path: str):
|
|
43 |
except Exception as e:
|
44 |
print(f"Error reading Pickle file '{file_path}': {e}")
|
45 |
return None
|
46 |
-
|
47 |
-
# def save_pickle(embeddings: dict, file_path: str) -> None:
|
48 |
-
# """
|
49 |
-
# Saves to a pickle file safely.
|
50 |
-
|
51 |
-
# Args:
|
52 |
-
# embeddings (dict): The embeddings to be saved.
|
53 |
-
# file_path (str): The file path where the embeddings will be saved.
|
54 |
-
|
55 |
-
# Returns:
|
56 |
-
# None
|
57 |
-
# """
|
58 |
-
# try:
|
59 |
-
# with open(file_path, "wb") as file:
|
60 |
-
# pickle.dump(embeddings, file)
|
61 |
-
# except Exception as e:
|
62 |
-
# print(f"Error saving embeddings to '{file_path}': {e}")
|
63 |
-
|
64 |
def load_file(file_path: str) -> str:
|
65 |
"""
|
66 |
Reads the text from a file safely.
|
@@ -81,6 +64,23 @@ def load_file(file_path: str) -> str:
|
|
81 |
print(f"Error reading file '{file_path}': {e}")
|
82 |
return ""
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
# def save_timestamp(timestamp: float, file_path: str):
|
85 |
# """
|
86 |
# Saves the timestamp to a file to persist across sessions.
|
@@ -98,43 +98,43 @@ def load_file(file_path: str) -> str:
|
|
98 |
# except Exception as e:
|
99 |
# print(f"Error saving timestamp: {e}")
|
100 |
|
101 |
-
# def
|
102 |
# """
|
103 |
-
#
|
104 |
|
105 |
# Args:
|
106 |
-
#
|
|
|
107 |
|
108 |
# Returns:
|
109 |
-
#
|
110 |
# """
|
111 |
-
# timestamp_str = load_file(file_path) # Use load_file function to read the file content
|
112 |
# try:
|
113 |
-
#
|
114 |
-
#
|
115 |
-
#
|
116 |
-
#
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
#
|
130 |
-
|
131 |
|
132 |
-
#
|
133 |
-
#
|
134 |
|
135 |
-
|
136 |
|
137 |
-
|
138 |
"""
|
139 |
Loads embeddings for the given database. If the database has been updated
|
140 |
since the last time embeddings were generated, new embeddings are created
|
@@ -150,14 +150,12 @@ def load_file(file_path: str) -> str:
|
|
150 |
"""
|
151 |
# # Get the timestamp of the last modification of the database file
|
152 |
# database_timestamp = os.path.getmtime(database_filepath)
|
153 |
-
|
154 |
# # Get the stored timestamp of the last database for which embeddings were generated
|
155 |
# timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
|
156 |
# previous_timestamp = load_timestamp(timestamp_filepath)
|
157 |
# print("Prev timestamp", previous_timestamp)
|
158 |
# print("DB timestamp", database_timestamp)
|
159 |
-
# database_embeddings = load_pickle(embeddings_filepath)
|
160 |
-
# print("Embeddings loaded.")
|
161 |
|
162 |
# # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
|
163 |
# if database_timestamp == previous_timestamp:
|
@@ -172,5 +170,9 @@ def load_file(file_path: str) -> str:
|
|
172 |
# # Update the stored timestamp
|
173 |
# save_timestamp(database_timestamp, timestamp_filepath)
|
174 |
# print("Embeddings updated.")
|
175 |
-
|
176 |
-
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import pickle
|
3 |
import pandas as pd
|
4 |
+
from utils.embedding_generation import compute_doc_embeddings
|
5 |
|
6 |
def load_database(file_path: str) -> pd.DataFrame:
|
7 |
"""
|
|
|
43 |
except Exception as e:
|
44 |
print(f"Error reading Pickle file '{file_path}': {e}")
|
45 |
return None
|
46 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def load_file(file_path: str) -> str:
|
48 |
"""
|
49 |
Reads the text from a file safely.
|
|
|
64 |
print(f"Error reading file '{file_path}': {e}")
|
65 |
return ""
|
66 |
|
67 |
+
# def load_timestamp(file_path: str) -> float:
|
68 |
+
# """
|
69 |
+
# Loads the timestamp from a file.
|
70 |
+
|
71 |
+
# Args:
|
72 |
+
# file_path (str): The file path from which the timestamp will be read.
|
73 |
+
|
74 |
+
# Returns:
|
75 |
+
# float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
|
76 |
+
# """
|
77 |
+
# timestamp_str = load_file(file_path) # Use load_file function to read the file content
|
78 |
+
# try:
|
79 |
+
# return float(timestamp_str) # Convert the string to a float
|
80 |
+
# except ValueError:
|
81 |
+
# print(f"Error: The content in '{file_path}' is not a valid float.")
|
82 |
+
# return 0.0 # Return a default value if the content is not valid
|
83 |
+
|
84 |
# def save_timestamp(timestamp: float, file_path: str):
|
85 |
# """
|
86 |
# Saves the timestamp to a file to persist across sessions.
|
|
|
98 |
# except Exception as e:
|
99 |
# print(f"Error saving timestamp: {e}")
|
100 |
|
101 |
+
# def save_pickle(embeddings: dict, file_path: str) -> None:
|
102 |
# """
|
103 |
+
# Saves to a pickle file safely.
|
104 |
|
105 |
# Args:
|
106 |
+
# embeddings (dict): The embeddings to be saved.
|
107 |
+
# file_path (str): The file path where the embeddings will be saved.
|
108 |
|
109 |
# Returns:
|
110 |
+
# None
|
111 |
# """
|
|
|
112 |
# try:
|
113 |
+
# with open(file_path, "wb") as file:
|
114 |
+
# pickle.dump(embeddings, file)
|
115 |
+
# except Exception as e:
|
116 |
+
# print(f"Error saving embeddings to '{file_path}': {e}")
|
117 |
|
118 |
+
def update_embeddings(database: pd.DataFrame, embeddings_filepath: str):
    """
    Generates new embeddings for the updated database.

    The embeddings are only computed and returned. Writing them to a pickle
    file is currently disabled (the save_pickle call below is commented out),
    so nothing is persisted to disk by this function.

    Args:
        database (pd.DataFrame): The updated database (e.g., a DataFrame).
        embeddings_filepath (str): The file path where the embeddings would be
            saved. Currently unused because saving is disabled; it is kept so
            existing callers do not need to change.

    Returns:
        database_embeddings: The newly generated embeddings for the database,
            exactly as returned by compute_doc_embeddings.
    """
    # Compute embeddings for the updated database
    database_embeddings = compute_doc_embeddings(database)

    # Saving is disabled; the save_pickle helper is also commented out in this
    # file, so re-enabling the line below requires restoring that helper too.
    # save_pickle(database_embeddings, embeddings_filepath)

    return database_embeddings
|
136 |
|
137 |
+
def load_embeddings(database, database_filepath, embeddings_filepath):
|
138 |
"""
|
139 |
Loads embeddings for the given database. If the database has been updated
|
140 |
since the last time embeddings were generated, new embeddings are created
|
|
|
150 |
"""
|
151 |
# # Get the timestamp of the last modification of the database file
|
152 |
# database_timestamp = os.path.getmtime(database_filepath)
|
153 |
+
|
154 |
# # Get the stored timestamp of the last database for which embeddings were generated
|
155 |
# timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
|
156 |
# previous_timestamp = load_timestamp(timestamp_filepath)
|
157 |
# print("Prev timestamp", previous_timestamp)
|
158 |
# print("DB timestamp", database_timestamp)
|
|
|
|
|
159 |
|
160 |
# # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
|
161 |
# if database_timestamp == previous_timestamp:
|
|
|
170 |
# # Update the stored timestamp
|
171 |
# save_timestamp(database_timestamp, timestamp_filepath)
|
172 |
# print("Embeddings updated.")
|
173 |
+
|
174 |
+
print("Embeddings updating.....")
|
175 |
+
database_embeddings = update_embeddings(database, embeddings_filepath)
|
176 |
+
print("Embeddings updated.")
|
177 |
+
|
178 |
+
return database_embeddings
|