Adityabhaskar committed on
Commit
253800a
·
verified ·
1 Parent(s): c4c4e2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -14
app.py CHANGED
@@ -28,24 +28,30 @@ class HybridExcelQuerySystem:
28
  --- NEW, MORE ROBUST VERSION ---
29
  Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
30
  """
31
- # Find the row with month names (it's the first non-empty row)
32
- header_row_index = df.dropna(how='all').index[0]
33
- month_headers = df.iloc[header_row_index].dropna().tolist()
34
 
35
- # Find the first column with financial metrics
36
- metric_col_name = df.columns[0]
37
- df_metrics = df[[metric_col_name]].dropna()
38
 
39
- # Find the starting row and column for the actual data
40
- start_row = df_metrics.index[0]
41
- start_col = df.columns.get_loc(month_headers[0])
42
 
43
- # Extract the core data, metrics, and headers
44
- data = df.iloc[start_row:, start_col:start_col+len(month_headers)]
45
- metrics = df.iloc[start_row:, 0]
 
 
 
 
 
 
 
 
46
 
47
  # Create a new, clean DataFrame
48
- clean_df = pd.DataFrame(data.values, index=metrics, columns=month_headers)
49
 
50
  # Transpose so months are rows
51
  clean_df = clean_df.T
@@ -63,6 +69,7 @@ class HybridExcelQuerySystem:
63
 
64
  return clean_df
65
 
 
66
  def load_excel_file(self, file_path: str) -> str:
67
  self.logs.clear()
68
  try:
@@ -71,7 +78,7 @@ class HybridExcelQuerySystem:
71
  self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
72
 
73
  for sheet_name in self.sheet_names:
74
- df = pd.read_excel(file_path, sheet_name=sheet_name, header=None) # Read without a header
75
 
76
  if sheet_name == "Numerical Data":
77
  agent_df = self._pivot_numerical_data(df.copy())
 
28
  --- NEW, MORE ROBUST VERSION ---
29
  Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
30
  """
31
+ # Define expected months to correctly identify the data block
32
+ months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
 
33
 
34
+ # Find the first row that contains at least one month name
35
+ header_row_index = df[df.isin(months)].dropna(how='all').index[0]
36
+ header_series = df.iloc[header_row_index]
37
 
38
+ # Filter this row to get only the actual month names
39
+ actual_month_headers = [h for h in header_series if h in months]
 
40
 
41
+ # Find the start and end column positions of the month data
42
+ start_col_pos = header_series.tolist().index(actual_month_headers[0])
43
+ end_col_pos = header_series.tolist().index(actual_month_headers[-1])
44
+
45
+ # Find the start row of the financial metrics
46
+ metric_col = df.iloc[:, 0].dropna()
47
+ start_row_pos = metric_col.index[1] # The metrics start after "Profit & Loss Account"
48
+
49
+ # Slice the core data block
50
+ data = df.iloc[start_row_pos:, start_col_pos:end_col_pos+1]
51
+ metrics = df.iloc[start_row_pos:, 0]
52
 
53
  # Create a new, clean DataFrame
54
+ clean_df = pd.DataFrame(data.values, index=metrics, columns=actual_month_headers)
55
 
56
  # Transpose so months are rows
57
  clean_df = clean_df.T
 
69
 
70
  return clean_df
71
 
72
+
73
  def load_excel_file(self, file_path: str) -> str:
74
  self.logs.clear()
75
  try:
 
78
  self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
79
 
80
  for sheet_name in self.sheet_names:
81
+ df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
82
 
83
  if sheet_name == "Numerical Data":
84
  agent_df = self._pivot_numerical_data(df.copy())