Gargi16 committed on
Commit
d68e4c1
·
verified ·
1 Parent(s): 5592f47

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import pandas as pd
4
+ from fpdf import FPDF
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import numpy as np
8
+
9
+ # PDF layout helpers (PDFReport) and the report builder (generate_data_report)
10
class PDFReport(FPDF):
    """FPDF subclass providing the building blocks of the exploration report.

    It draws a bordered page header, bordered section titles, free-text
    paragraphs and simple bordered tables, all set in Arial.
    """

    def _bordered_banner(self, caption, gap_after):
        """Draw *caption* centred in a full-width bordered cell, then add a gap."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, caption, border=1, ln=1, align='C')
        self.ln(gap_after)

    def header(self):
        """Page header hook: print the report title banner."""
        self._bordered_banner('Data Exploration Report', 10)

    def chapter_title(self, title):
        """Print *title* as a bordered, centred section heading."""
        self._bordered_banner(title, 5)

    def chapter_body(self, text):
        """Print *text* as a wrapped paragraph followed by a line break."""
        self.set_font('Arial', '', 10)
        self.multi_cell(0, 10, text)
        self.ln()

    def add_table(self, headers, data, col_widths):
        """Render a bordered table.

        Args:
            headers: Header labels, one per column.
            data: Iterable of rows; every cell is converted with ``str``.
            col_widths: Cell widths, indexed by column position.
        """
        # Header row: bold and centred.
        self.set_font('Arial', 'B', 10)
        for position, label in enumerate(headers):
            width = col_widths[position]
            self.cell(width, 10, label, border=1, align='C')
        self.ln()

        # Body rows: regular weight, default alignment.
        self.set_font('Arial', '', 10)
        for record in data:
            for position, value in enumerate(record):
                width = col_widths[position]
                self.cell(width, 10, str(value), border=1)
            self.ln()
36
def _load_report_frame(data):
    """Return ``(frame, file_path, file_format)`` for a path or a DataFrame.

    A string is treated as a path to a CSV or Excel file and is loaded with
    pandas; anything else is returned unchanged and labelled "DataFrame".

    Raises:
        ValueError: If the path has an extension other than .csv/.xls/.xlsx.
    """
    if not isinstance(data, str):
        return data, "DataFrame", "DataFrame"

    extension = os.path.splitext(data)[1].lower()
    if extension == '.csv':
        return pd.read_csv(data), data, 'CSV'
    if extension in ('.xls', '.xlsx'):
        return pd.read_excel(data), data, 'Excel'
    # Previously this case fell through and crashed later on `data.shape`.
    raise ValueError(
        f"Unsupported file format '{extension}'; expected .csv, .xls or .xlsx"
    )


def _plot_image_path(directory, idx, col):
    """Build a filesystem-safe PNG path for the plot of column *col*.

    Column names may contain path separators or other characters that are not
    valid in file names, so they are reduced to a conservative character set.
    The index prefix keeps names unique even when two sanitised names collide.
    """
    import re  # local import: the module does not import `re` at top level

    safe_name = re.sub(r'[^A-Za-z0-9_.-]+', '_', str(col)).strip('._') or 'column'
    return os.path.join(directory, f"{idx}_{safe_name}.png")


def generate_data_report(data, output_file='data_report.pdf', selected_columns=None):
    """Build a PDF data-exploration report for a dataset.

    The report lists basic file information, columns with missing values,
    columns grouped by dtype, constant columns (which are then dropped), one
    box plot per numeric column and one distribution chart per selected
    column. Plot images are written to the ``box_plots`` and
    ``distribution_charts`` directories in the current working directory.

    Args:
        data: Path to a .csv/.xls/.xlsx file, or an already loaded DataFrame.
        output_file: Destination path of the generated PDF.
        selected_columns: Columns to draw distribution charts for. Defaults to
            the first six columns remaining after constant columns are dropped.

    Raises:
        ValueError: If *data* is a path with an unsupported extension.
        KeyError: If *selected_columns* names a column absent from the data.
    """
    data, file_path, file_format = _load_report_frame(data)

    pdf = PDFReport()
    pdf.add_page()

    # --- Summary ---------------------------------------------------------
    pdf.set_font('Arial', 'B', 12)
    pdf.cell(0, 10, f"File Name: {os.path.basename(file_path)}", ln=True, align='L')
    pdf.cell(0, 10, f"File Format: {file_format}", ln=True, align='L')
    pdf.cell(0, 10, f"Total Data: {data.shape[0]} rows, {data.shape[1]} columns", ln=True, align='L')
    pdf.ln(10)

    # --- Missing values --------------------------------------------------
    pdf.chapter_title("Columns with Missing Values")
    total_values = len(data)
    missing_values = data.isnull().sum()
    missing_cols = [
        [col, total_values, missing_values[col]]
        for col in missing_values[missing_values > 0].index
    ]
    if missing_cols:
        pdf.add_table(["Column Name", "Total Values", "Missing Values"], missing_cols, [100, 40, 50])
    else:
        pdf.chapter_body("No columns with missing values.")

    # --- Data types ------------------------------------------------------
    pdf.chapter_title("Columns Categorized by Data Type")
    dtype_rows = [
        [str(dtype), count] for dtype, count in data.dtypes.value_counts().items()
    ]
    pdf.add_table(["Data Type", "Count"], dtype_rows, [100, 50])

    # Group columns by their exact dtype in a single pass. Dict insertion
    # order keeps dtypes in order of first appearance.
    column_types = {}
    for col, dtype in data.dtypes.items():
        column_types.setdefault(str(dtype), []).append(col)

    for dtype, columns in column_types.items():
        pdf.chapter_title(f"Columns of Type: {dtype}")
        pdf.add_table(["Column Name"], [[col] for col in columns], [190])

    # --- Constant columns ------------------------------------------------
    pdf.chapter_title("Constant Columns")
    constant_cols = [col for col in data.columns if data[col].nunique() == 1]
    if constant_cols:
        pdf.add_table(["Constant Column Name"], [[col] for col in constant_cols], [190])
        data = data.drop(columns=constant_cols)
        pdf.chapter_body("Constant Columns After Removal: None")
    else:
        pdf.chapter_body("No constant columns found.")

    # --- Box plots -------------------------------------------------------
    pdf.chapter_title("Box Plots for Numeric Columns")
    numeric_cols = data.select_dtypes(include=np.number).columns
    boxplot_dir = "box_plots"
    os.makedirs(boxplot_dir, exist_ok=True)

    boxplot_colors = ['#FF6347', '#3CB371', '#8A2BE2', '#FF4500', '#1E90FF', '#FFD700']

    for idx, col in enumerate(numeric_cols):
        image_path = _plot_image_path(boxplot_dir, idx, col)
        plt.figure(figsize=(6, 4))
        try:
            sns.boxplot(x=data[col], color=boxplot_colors[idx % len(boxplot_colors)])
            plt.title(f"Box Plot: {col}")
            plt.savefig(image_path, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close()
        pdf.add_page()
        pdf.chapter_title(f"Box Plot: {col}")
        pdf.image(image_path, w=170)

    # --- Distribution charts ---------------------------------------------
    pdf.chapter_title("Distribution Charts")
    dist_dir = "distribution_charts"
    os.makedirs(dist_dir, exist_ok=True)
    if selected_columns is None:
        selected_columns = data.columns[:6]
    else:
        # Constant columns were dropped above and are already reported, so
        # skip them instead of failing; anything else unknown is an error.
        dropped = set(constant_cols)
        selected_columns = [col for col in selected_columns if col not in dropped]
        unknown = [col for col in selected_columns if col not in data.columns]
        if unknown:
            raise KeyError(f"Columns not found in data: {unknown}")

    dist_colors = ['#8B0000', '#228B22', '#DAA520', '#B0C4DE', '#9932CC', '#FF69B4']

    for idx, col in enumerate(selected_columns):
        image_path = _plot_image_path(dist_dir, idx, col)
        color = dist_colors[idx % len(dist_colors)]
        plt.figure(figsize=(6, 4))
        try:
            if col in numeric_cols:
                sns.histplot(data[col], kde=True, color=color)
            else:
                data[col].value_counts().plot(kind='bar', color=color)
            plt.title(f"Distribution of {col}")
            # 'tight' keeps rotated category labels from being clipped.
            plt.savefig(image_path, bbox_inches='tight')
        finally:
            plt.close()
        pdf.add_page()
        pdf.chapter_title(f"Distribution: {col}")
        pdf.image(image_path, w=170)

    pdf.output(output_file)
    print(f"Report saved as {output_file}")
139
def generate_report(file):
    """Gradio callback: build the PDF report for an uploaded dataset.

    Args:
        file: The value supplied by the ``gr.File`` input. Depending on the
            Gradio version/configuration this is either a file path string or
            a temp-file wrapper exposing the path as ``.name``; ``None`` when
            nothing was uploaded.

    Returns:
        Path of the generated PDF, for the ``gr.File`` output component.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload a .csv, .xls or .xlsx file first.")

    # Accept both a plain path string and an object carrying the path in `.name`.
    file_path = getattr(file, "name", file)
    output_file = "data_report.pdf"
    generate_data_report(file_path, output_file=output_file)
    return output_file
144
+
145
# Gradio front end: one file upload in, one downloadable PDF out.
_interface_settings = {
    "fn": generate_report,
    "inputs": gr.File(label="Upload Dataset (.csv or .xlsx)"),
    "outputs": gr.File(label="Download PDF Report"),
    "title": "Data Exploration Tool",
    "description": "Upload your dataset to generate a PDF data exploration report.",
}
iface = gr.Interface(**_interface_settings)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()