AashitaK committed on
Commit
9014f4d
·
verified ·
1 Parent(s): edd3ed5

Update utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +50 -48
utils/file_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import pickle
3
  import pandas as pd
4
- # from utils.embedding_generation import compute_doc_embeddings
5
 
6
  def load_database(file_path: str) -> pd.DataFrame:
7
  """
@@ -43,24 +43,7 @@ def load_pickle(file_path: str):
43
  except Exception as e:
44
  print(f"Error reading Pickle file '{file_path}': {e}")
45
  return None
46
-
47
- # def save_pickle(embeddings: dict, file_path: str) -> None:
48
- # """
49
- # Saves to a pickle file safely.
50
-
51
- # Args:
52
- # embeddings (dict): The embeddings to be saved.
53
- # file_path (str): The file path where the embeddings will be saved.
54
-
55
- # Returns:
56
- # None
57
- # """
58
- # try:
59
- # with open(file_path, "wb") as file:
60
- # pickle.dump(embeddings, file)
61
- # except Exception as e:
62
- # print(f"Error saving embeddings to '{file_path}': {e}")
63
-
64
  def load_file(file_path: str) -> str:
65
  """
66
  Reads the text from a file safely.
@@ -81,6 +64,23 @@ def load_file(file_path: str) -> str:
81
  print(f"Error reading file '{file_path}': {e}")
82
  return ""
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # def save_timestamp(timestamp: float, file_path: str):
85
  # """
86
  # Saves the timestamp to a file to persist across sessions.
@@ -98,43 +98,43 @@ def load_file(file_path: str) -> str:
98
  # except Exception as e:
99
  # print(f"Error saving timestamp: {e}")
100
 
101
- # def load_timestamp(file_path: str) -> float:
102
  # """
103
- # Loads the timestamp from a file.
104
 
105
  # Args:
106
- # file_path (str): The file path from which the timestamp will be read.
 
107
 
108
  # Returns:
109
- # float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
110
  # """
111
- # timestamp_str = load_file(file_path) # Use load_file function to read the file content
112
  # try:
113
- # return float(timestamp_str) # Convert the string to a float
114
- # except ValueError:
115
- # print(f"Error: The content in '{file_path}' is not a valid float.")
116
- # return 0.0 # Return a default value if the content is not valid
117
 
118
- # def update_embeddings(database:pd.DataFrame, embeddings_filepath: str):
119
- # """
120
- # Generates new embeddings for the updated database and saves them as a pickle file.
121
 
122
- # Args:
123
- # database (pd.DataFrame): The updated database (e.g., a DataFrame).
124
- # embeddings_filepath (str): The file path where the embeddings will be saved.
125
 
126
- # Returns:
127
- # database_embeddings: The newly generated embeddings for the database.
128
- # """
129
- # # Compute embeddings for the updated database
130
- # database_embeddings = compute_doc_embeddings(database)
131
 
132
- # # Save the newly computed embeddings to a pickle file
133
- # save_pickle(database_embeddings, embeddings_filepath)
134
 
135
- # return database_embeddings
136
 
137
- # def load_embeddings(database, database_filepath, embeddings_filepath):
138
  """
139
  Loads embeddings for the given database. If the database has been updated
140
  since the last time embeddings were generated, new embeddings are created
@@ -150,14 +150,12 @@ def load_file(file_path: str) -> str:
150
  """
151
  # # Get the timestamp of the last modification of the database file
152
  # database_timestamp = os.path.getmtime(database_filepath)
153
-
154
  # # Get the stored timestamp of the last database for which embeddings were generated
155
  # timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
156
  # previous_timestamp = load_timestamp(timestamp_filepath)
157
  # print("Prev timestamp", previous_timestamp)
158
  # print("DB timestamp", database_timestamp)
159
- # database_embeddings = load_pickle(embeddings_filepath)
160
- # print("Embeddings loaded.")
161
 
162
  # # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
163
  # if database_timestamp == previous_timestamp:
@@ -172,5 +170,9 @@ def load_file(file_path: str) -> str:
172
  # # Update the stored timestamp
173
  # save_timestamp(database_timestamp, timestamp_filepath)
174
  # print("Embeddings updated.")
175
-
176
- # return database_embeddings
 
 
 
 
 
1
  import os
2
  import pickle
3
  import pandas as pd
4
+ from utils.embedding_generation import compute_doc_embeddings
5
 
6
  def load_database(file_path: str) -> pd.DataFrame:
7
  """
 
43
  except Exception as e:
44
  print(f"Error reading Pickle file '{file_path}': {e}")
45
  return None
46
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def load_file(file_path: str) -> str:
48
  """
49
  Reads the text from a file safely.
 
64
  print(f"Error reading file '{file_path}': {e}")
65
  return ""
66
 
67
+ # def load_timestamp(file_path: str) -> float:
68
+ # """
69
+ # Loads the timestamp from a file.
70
+
71
+ # Args:
72
+ # file_path (str): The file path from which the timestamp will be read.
73
+
74
+ # Returns:
75
+ # float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
76
+ # """
77
+ # timestamp_str = load_file(file_path) # Use load_file function to read the file content
78
+ # try:
79
+ # return float(timestamp_str) # Convert the string to a float
80
+ # except ValueError:
81
+ # print(f"Error: The content in '{file_path}' is not a valid float.")
82
+ # return 0.0 # Return a default value if the content is not valid
83
+
84
  # def save_timestamp(timestamp: float, file_path: str):
85
  # """
86
  # Saves the timestamp to a file to persist across sessions.
 
98
  # except Exception as e:
99
  # print(f"Error saving timestamp: {e}")
100
 
101
+ # def save_pickle(embeddings: dict, file_path: str) -> None:
102
  # """
103
+ # Saves to a pickle file safely.
104
 
105
  # Args:
106
+ # embeddings (dict): The embeddings to be saved.
107
+ # file_path (str): The file path where the embeddings will be saved.
108
 
109
  # Returns:
110
+ # None
111
  # """
 
112
  # try:
113
+ # with open(file_path, "wb") as file:
114
+ # pickle.dump(embeddings, file)
115
+ # except Exception as e:
116
+ # print(f"Error saving embeddings to '{file_path}': {e}")
117
 
118
+ def update_embeddings(database:pd.DataFrame, embeddings_filepath: str):
119
+ """
120
+ Generates new embeddings for the updated database and saves them as a pickle file.
121
 
122
+ Args:
123
+ database (pd.DataFrame): The updated database (e.g., a DataFrame).
124
+ embeddings_filepath (str): The file path where the embeddings will be saved.
125
 
126
+ Returns:
127
+ database_embeddings: The newly generated embeddings for the database.
128
+ """
129
+ # Compute embeddings for the updated database
130
+ database_embeddings = compute_doc_embeddings(database)
131
 
132
+ # # Save the newly computed embeddings to a pickle file
133
+ # save_pickle(database_embeddings, embeddings_filepath)
134
 
135
+ return database_embeddings
136
 
137
+ def load_embeddings(database, database_filepath, embeddings_filepath):
138
  """
139
  Loads embeddings for the given database. If the database has been updated
140
  since the last time embeddings were generated, new embeddings are created
 
150
  """
151
  # # Get the timestamp of the last modification of the database file
152
  # database_timestamp = os.path.getmtime(database_filepath)
153
+
154
  # # Get the stored timestamp of the last database for which embeddings were generated
155
  # timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
156
  # previous_timestamp = load_timestamp(timestamp_filepath)
157
  # print("Prev timestamp", previous_timestamp)
158
  # print("DB timestamp", database_timestamp)
 
 
159
 
160
  # # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
161
  # if database_timestamp == previous_timestamp:
 
170
  # # Update the stored timestamp
171
  # save_timestamp(database_timestamp, timestamp_filepath)
172
  # print("Embeddings updated.")
173
+
174
+ print("Embeddings updating.....")
175
+ database_embeddings = update_embeddings(database, embeddings_filepath)
176
+ print("Embeddings updated.")
177
+
178
+ return database_embeddings