# Data_Analyzer / app.py
# Gargi16's picture
# Update app.py
# f78939d verified
import gradio as gr
import os
import pandas as pd
from fpdf import FPDF
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# PDF layout helper (PDFReport) and the report builder (generate_data_report).
class PDFReport(FPDF):
    """FPDF document with the layout helpers used by the data exploration report.

    Every helper that writes caller-supplied text passes it through
    :meth:`_latin1` first. The report is rendered with the built-in Arial
    core font, which only covers Latin-1, so text taken from arbitrary
    datasets (column names, cell values) would otherwise abort PDF
    generation with an encoding error.
    """

    @staticmethod
    def _latin1(text):
        """Return ``str(text)`` with non-Latin-1 characters replaced by ``?``."""
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    def header(self):
        """Draw the boxed, centred report banner (FPDF page-header hook)."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Data Exploration Report', border=1, ln=1, align='C')
        self.ln(10)

    def chapter_title(self, title):
        """Write *title* as a boxed, centred, bold section heading."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, self._latin1(title), border=1, ln=1, align='C')
        self.ln(5)

    def chapter_body(self, text):
        """Write *text* as a regular-weight paragraph that wraps across lines."""
        self.set_font('Arial', '', 10)
        self.multi_cell(0, 10, self._latin1(text))
        self.ln()

    def add_table(self, headers, data, col_widths):
        """Render a bordered table.

        Args:
            headers: Column captions, drawn bold and centred in the first row.
            data: Iterable of rows; each row is an iterable of cell values,
                which are converted with ``str()`` before drawing.
            col_widths: Width of each column, indexed in parallel with
                ``headers`` and with the items of every row.
        """
        # Header row.
        self.set_font('Arial', 'B', 10)
        for idx, header in enumerate(headers):
            self.cell(col_widths[idx], 10, self._latin1(header), border=1, align='C')
        self.ln()

        # Body rows: one line break per row so the next row starts at the left.
        self.set_font('Arial', '', 10)
        for row in data:
            for idx, item in enumerate(row):
                self.cell(col_widths[idx], 10, self._latin1(item), border=1)
            self.ln()
def _load_dataset(data):
    """Resolve *data* into a ``(frame, file_path, file_format)`` triple.

    A string is treated as a path and loaded according to its extension;
    anything else is assumed to be an already-loaded DataFrame.

    Raises:
        ValueError: If *data* is a path whose extension is not .csv, .xls
            or .xlsx.
    """
    if not isinstance(data, str):
        return data, "DataFrame", "DataFrame"

    file_extension = os.path.splitext(data)[1].lower()
    if file_extension == '.csv':
        return pd.read_csv(data), data, 'CSV'
    if file_extension in ('.xls', '.xlsx'):
        return pd.read_excel(data), data, 'Excel'
    raise ValueError(
        f"Unsupported file format '{file_extension}'; "
        "expected .csv, .xls or .xlsx"
    )


def _image_path(directory, index, col):
    """Build a filesystem-safe PNG path for the chart of column *col*.

    Column names come from user data and may contain path separators or
    other characters that are invalid in file names, so anything outside
    ASCII alphanumerics and ``-_.`` is replaced by ``_``. The index prefix
    keeps the path unique when two names sanitise to the same string.
    """
    safe_name = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "-_." else "_"
        for ch in str(col)
    )
    return os.path.join(directory, f"{index:03d}_{safe_name}.png")


def _add_image_page(pdf, title, image_path):
    """Start a new PDF page headed by *title* and place the image on it."""
    pdf.add_page()
    pdf.chapter_title(title)
    pdf.image(image_path, w=170)


def generate_data_report(data, output_file='data_report.pdf', selected_columns=None):
    """Write a PDF exploration report for a dataset.

    The report lists file information, columns with missing values, a
    breakdown of columns by dtype, constant columns (which are dropped
    before plotting), a box plot per numeric column and a distribution
    chart per selected column.

    Side effects: creates the ``box_plots`` and ``distribution_charts``
    directories in the working directory, writes one PNG per chart into
    them, writes the PDF to *output_file* and prints a confirmation line.

    Args:
        data: Path to a .csv/.xls/.xlsx file, or an already-loaded DataFrame.
        output_file: Destination path of the PDF.
        selected_columns: Columns to draw distribution charts for. Defaults
            to the first six columns that remain after constant columns are
            removed. Selected columns that turn out to be constant are
            skipped, because they are dropped before plotting.

    Raises:
        ValueError: If *data* is a path with an unsupported extension.
        KeyError: If *selected_columns* names a column that is not in the
            dataset.
    """
    data, file_path, file_format = _load_dataset(data)

    if selected_columns is not None:
        # Materialise once (the argument may be a one-shot iterable) and
        # fail before any output is produced if a column does not exist.
        selected_columns = list(selected_columns)
        unknown = [col for col in selected_columns if col not in data.columns]
        if unknown:
            raise KeyError(f"Selected columns not found in data: {unknown}")

    pdf = PDFReport()
    pdf.add_page()

    # --- File information -------------------------------------------------
    pdf.set_font('Arial', 'B', 12)
    pdf.cell(0, 10, f"File Name: {os.path.basename(file_path)}", ln=True, align='L')
    pdf.cell(0, 10, f"File Format: {file_format}", ln=True, align='L')
    pdf.cell(0, 10, f"Total Data: {data.shape[0]} rows, {data.shape[1]} columns", ln=True, align='L')
    pdf.ln(10)

    # --- Missing values ---------------------------------------------------
    pdf.chapter_title("Columns with Missing Values")
    total_values = len(data)
    missing_values = data.isnull().sum()
    missing_cols = [
        [col, total_values, missing_values[col]]
        for col in missing_values[missing_values > 0].index
    ]
    if missing_cols:
        pdf.add_table(["Column Name", "Total Values", "Missing Values"], missing_cols, [100, 40, 50])
    else:
        pdf.chapter_body("No columns with missing values.")

    # --- Columns by dtype -------------------------------------------------
    pdf.chapter_title("Columns Categorized by Data Type")
    dtypes_summary = data.dtypes.value_counts().reset_index()
    dtypes_summary.columns = ['Data Type', 'Count']
    pdf.add_table(["Data Type", "Count"], dtypes_summary.values.tolist(), [100, 50])

    # Group column names by dtype in a single pass, keeping the order in
    # which each dtype first appears.
    column_types = {}
    for col, dtype in data.dtypes.items():
        column_types.setdefault(str(dtype), []).append(col)
    for dtype, columns in column_types.items():
        pdf.chapter_title(f"Columns of Type: {dtype}")
        pdf.add_table(["Column Name"], [[col] for col in columns], [190])

    # --- Constant columns -------------------------------------------------
    pdf.chapter_title("Constant Columns")
    constant_cols = [col for col in data.columns if data[col].nunique() == 1]
    if constant_cols:
        pdf.add_table(["Constant Column Name"], [[col] for col in constant_cols], [190])
        # drop() returns a new frame, so a caller-supplied DataFrame is not mutated.
        data = data.drop(columns=constant_cols)
        pdf.chapter_body("Constant Columns After Removal: None")
    else:
        pdf.chapter_body("No constant columns found.")

    # --- Box plots --------------------------------------------------------
    pdf.chapter_title("Box Plots for Numeric Columns")
    numeric_cols = data.select_dtypes(include=np.number).columns
    boxplot_dir = "box_plots"
    os.makedirs(boxplot_dir, exist_ok=True)
    boxplot_colors = ['#FF6347', '#3CB371', '#8A2BE2', '#FF4500', '#1E90FF', '#FFD700']
    if len(numeric_cols) == 0:
        pdf.chapter_body("No numeric columns found.")
    for idx, col in enumerate(numeric_cols):
        image_path = _image_path(boxplot_dir, idx, col)
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            sns.boxplot(x=data[col], color=boxplot_colors[idx % len(boxplot_colors)], ax=ax)
            ax.set_title(f"Box Plot: {col}")
            fig.savefig(image_path)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)
        _add_image_page(pdf, f"Box Plot: {col}", image_path)

    # --- Distribution charts ----------------------------------------------
    pdf.chapter_title("Distribution Charts")
    dist_dir = "distribution_charts"
    os.makedirs(dist_dir, exist_ok=True)
    if selected_columns is None:
        selected_columns = data.columns[:6]
    else:
        # Constant columns were dropped above; skip them instead of failing.
        selected_columns = [col for col in selected_columns if col in data.columns]
    dist_colors = ['#8B0000', '#228B22', '#DAA520', '#B0C4DE', '#9932CC', '#FF69B4']
    for idx, col in enumerate(selected_columns):
        image_path = _image_path(dist_dir, idx, col)
        color = dist_colors[idx % len(dist_colors)]
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            if col in numeric_cols:
                sns.histplot(data[col], kde=True, color=color, ax=ax)
            else:
                data[col].value_counts().plot(kind='bar', color=color, ax=ax)
            ax.set_title(f"Distribution of {col}")
            fig.savefig(image_path)
        finally:
            plt.close(fig)
        _add_image_page(pdf, f"Distribution: {col}", image_path)

    pdf.output(output_file)
    print(f"Report saved as {output_file}")
def generate_report(file):
    """Gradio callback: build the PDF report for an uploaded dataset.

    Args:
        file: The uploaded file as handed over by the ``gr.File`` input.
            Either a path (``str`` / ``os.PathLike``) or an object exposing
            the path as ``.name`` is accepted, since the value's shape
            depends on how the File component is configured.

    Returns:
        Path of the generated PDF, for the ``gr.File`` output component.

    Raises:
        ValueError: If no file was uploaded (*file* is ``None``).
    """
    if file is None:
        raise ValueError("No file uploaded. Please upload a .csv or .xlsx dataset.")

    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    output_file = "data_report.pdf"
    generate_data_report(file_path, output_file=output_file)
    return output_file
# Upload widget feeding the report generator, and download widget for its result.
_dataset_upload = gr.File(label="Upload Dataset (.csv or .xlsx)")
_report_download = gr.File(label="Download PDF Report")

# Single-function web UI: uploaded dataset in, generated PDF out.
iface = gr.Interface(
    title="Data Exploration Tool",
    description="Upload your dataset to generate a PDF data exploration report.",
    fn=generate_report,
    inputs=_dataset_upload,
    outputs=_report_download,
    css="style.css",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()