Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -28,24 +28,30 @@ class HybridExcelQuerySystem:
|
|
28 |
--- NEW, MORE ROBUST VERSION ---
|
29 |
Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
|
30 |
"""
|
31 |
-
#
|
32 |
-
|
33 |
-
month_headers = df.iloc[header_row_index].dropna().tolist()
|
34 |
|
35 |
-
# Find the first
|
36 |
-
|
37 |
-
|
38 |
|
39 |
-
#
|
40 |
-
|
41 |
-
start_col = df.columns.get_loc(month_headers[0])
|
42 |
|
43 |
-
#
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# Create a new, clean DataFrame
|
48 |
-
clean_df = pd.DataFrame(data.values, index=metrics, columns=
|
49 |
|
50 |
# Transpose so months are rows
|
51 |
clean_df = clean_df.T
|
@@ -63,6 +69,7 @@ class HybridExcelQuerySystem:
|
|
63 |
|
64 |
return clean_df
|
65 |
|
|
|
66 |
def load_excel_file(self, file_path: str) -> str:
|
67 |
self.logs.clear()
|
68 |
try:
|
@@ -71,7 +78,7 @@ class HybridExcelQuerySystem:
|
|
71 |
self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
|
72 |
|
73 |
for sheet_name in self.sheet_names:
|
74 |
-
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
|
75 |
|
76 |
if sheet_name == "Numerical Data":
|
77 |
agent_df = self._pivot_numerical_data(df.copy())
|
|
|
28 |
--- NEW, MORE ROBUST VERSION ---
|
29 |
Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
|
30 |
"""
|
31 |
+
# Define expected months to correctly identify the data block
|
32 |
+
months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
|
|
|
33 |
|
34 |
+
# Find the first row that contains at least one month name
|
35 |
+
header_row_index = df[df.isin(months)].dropna(how='all').index[0]
|
36 |
+
header_series = df.iloc[header_row_index]
|
37 |
|
38 |
+
# Filter this row to get only the actual month names
|
39 |
+
actual_month_headers = [h for h in header_series if h in months]
|
|
|
40 |
|
41 |
+
# Find the start and end column positions of the month data
|
42 |
+
start_col_pos = header_series.tolist().index(actual_month_headers[0])
|
43 |
+
end_col_pos = header_series.tolist().index(actual_month_headers[-1])
|
44 |
+
|
45 |
+
# Find the start row of the financial metrics
|
46 |
+
metric_col = df.iloc[:, 0].dropna()
|
47 |
+
start_row_pos = metric_col.index[1] # The metrics start after "Profit & Loss Account"
|
48 |
+
|
49 |
+
# Slice the core data block
|
50 |
+
data = df.iloc[start_row_pos:, start_col_pos:end_col_pos+1]
|
51 |
+
metrics = df.iloc[start_row_pos:, 0]
|
52 |
|
53 |
# Create a new, clean DataFrame
|
54 |
+
clean_df = pd.DataFrame(data.values, index=metrics, columns=actual_month_headers)
|
55 |
|
56 |
# Transpose so months are rows
|
57 |
clean_df = clean_df.T
|
|
|
69 |
|
70 |
return clean_df
|
71 |
|
72 |
+
|
73 |
def load_excel_file(self, file_path: str) -> str:
|
74 |
self.logs.clear()
|
75 |
try:
|
|
|
78 |
self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
|
79 |
|
80 |
for sheet_name in self.sheet_names:
|
81 |
+
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
|
82 |
|
83 |
if sheet_name == "Numerical Data":
|
84 |
agent_df = self._pivot_numerical_data(df.copy())
|