# Residue from the web file viewer this source was copied from (not part of the program):
#   Spaces: Sleeping
#   File size: 5,731 Bytes
#   d68e4c1 f78939d d68e4c1
#   (line-number gutter 1-156 omitted)
import gradio as gr
import os
import pandas as pd
from fpdf import FPDF
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# PDF layout class (PDFReport) and the report generator (generate_data_report) used by the app.
class PDFReport(FPDF):
    """FPDF subclass bundling the layout helpers used by the data report.

    Every helper sets its own font before drawing, so callers do not need
    to manage font state between calls.
    """

    def header(self):
        """Page-header hook (overrides ``FPDF.header``): boxed, centred banner."""
        self.set_font('Arial', 'B', 12)
        banner = 'Data Exploration Report'
        self.cell(0, 10, banner, border=1, ln=1, align='C')
        # Leave a gap between the banner and the page content.
        self.ln(10)

    def chapter_title(self, title):
        """Draw ``title`` as a boxed, centred, bold section heading."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, border=1, ln=1, align='C')
        self.ln(5)

    def chapter_body(self, text):
        """Write ``text`` as a wrapped paragraph in the regular 10pt font."""
        self.set_font('Arial', '', 10)
        self.multi_cell(0, 10, text)
        self.ln()

    def add_table(self, headers, data, col_widths):
        """Render a bordered table.

        Args:
            headers: Column captions, drawn bold and centred in the first row.
            data: Iterable of rows; each cell value is converted with ``str``.
            col_widths: Cell widths, looked up by column position.
        """
        row_height = 10

        # Caption row.
        self.set_font('Arial', 'B', 10)
        for position, label in enumerate(headers):
            width = col_widths[position]
            self.cell(width, row_height, label, border=1, align='C')
        self.ln()

        # Body rows.
        self.set_font('Arial', '', 10)
        for record in data:
            for position, value in enumerate(record):
                self.cell(col_widths[position], row_height, str(value), border=1)
            self.ln()
def _plot_file_stem(index, column):
    """Return a filesystem-safe, per-column-unique file stem for a chart image."""
    # Column names may contain path separators or other characters that are
    # invalid in file names; the index prefix keeps sanitised names distinct.
    cleaned = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in str(column))
    return f"{index}_{cleaned}"


def generate_data_report(data, output_file='data_report.pdf', selected_columns=None):
    """Build a PDF data-exploration report for a dataset.

    The report contains: file summary, columns with missing values, columns
    grouped by dtype, constant columns (which are then excluded from the
    charts), one box plot per numeric column, and one distribution chart per
    selected column.

    Args:
        data: Path (``str`` or ``os.PathLike``) to a ``.csv``/``.xls``/``.xlsx``
            file, or an already-loaded pandas DataFrame.
        output_file: Destination path of the generated PDF.
        selected_columns: Columns to draw distribution charts for. Defaults to
            the first six columns remaining after constant columns are
            dropped. Requested columns that are absent (or were dropped as
            constant) are skipped and listed in the report.

    Raises:
        ValueError: If ``data`` is a path whose extension is not supported.

    Side effects:
        Writes chart images into ``box_plots/`` and ``distribution_charts/``
        under the current working directory, writes ``output_file``, and
        prints a confirmation line.
    """
    if isinstance(data, (str, os.PathLike)):
        file_path = os.fspath(data)
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.csv':
            file_format = 'CSV'
            data = pd.read_csv(file_path)
        elif file_extension in ('.xls', '.xlsx'):
            file_format = 'Excel'
            data = pd.read_excel(file_path)
        else:
            # Fail early with a clear message instead of crashing later on
            # ``data.shape`` while ``data`` is still a path string.
            raise ValueError(
                f"Unsupported file format {file_extension!r}; "
                "expected .csv, .xls or .xlsx"
            )
    else:
        file_format = 'DataFrame'
        file_path = "DataFrame"

    pdf = PDFReport()
    pdf.add_page()

    # --- File summary -----------------------------------------------------
    pdf.set_font('Arial', 'B', 12)
    pdf.cell(0, 10, f"File Name: {os.path.basename(file_path)}", ln=True, align='L')
    pdf.cell(0, 10, f"File Format: {file_format}", ln=True, align='L')
    pdf.cell(0, 10, f"Total Data: {data.shape[0]} rows, {data.shape[1]} columns", ln=True, align='L')
    pdf.ln(10)

    # --- Missing values ---------------------------------------------------
    pdf.chapter_title("Columns with Missing Values")
    total_values = len(data)
    missing_values = data.isnull().sum()
    missing_cols = [
        [col, total_values, missing_values[col]]
        for col in missing_values[missing_values > 0].index
    ]
    if missing_cols:
        pdf.add_table(["Column Name", "Total Values", "Missing Values"], missing_cols, [100, 40, 50])
    else:
        pdf.chapter_body("No columns with missing values.")

    # --- Dtype overview ---------------------------------------------------
    pdf.chapter_title("Columns Categorized by Data Type")
    dtypes_summary = data.dtypes.value_counts().reset_index()
    dtypes_summary.columns = ['Data Type', 'Count']
    pdf.add_table(["Data Type", "Count"], dtypes_summary.values.tolist(), [100, 50])

    # Group straight from ``data.dtypes`` so every column lands under exactly
    # its own dtype label, in first-appearance order.
    column_types = {}
    for col, dtype in data.dtypes.items():
        column_types.setdefault(str(dtype), []).append(col)
    for dtype, columns in column_types.items():
        pdf.chapter_title(f"Columns of Type: {dtype}")
        pdf.add_table(["Column Name"], [[col] for col in columns], [190])

    # --- Constant columns -------------------------------------------------
    pdf.chapter_title("Constant Columns")
    constant_cols = [col for col in data.columns if data[col].nunique() == 1]
    if constant_cols:
        pdf.add_table(["Constant Column Name"], [[col] for col in constant_cols], [190])
        # Rebinds the local name only; the caller's DataFrame is not modified.
        data = data.drop(columns=constant_cols)
        pdf.chapter_body("Constant Columns After Removal: None")
    else:
        pdf.chapter_body("No constant columns found.")

    # --- Box plots --------------------------------------------------------
    pdf.chapter_title("Box Plots for Numeric Columns")
    numeric_cols = data.select_dtypes(include=np.number).columns
    boxplot_dir = "box_plots"
    os.makedirs(boxplot_dir, exist_ok=True)
    boxplot_colors = ['#FF6347', '#3CB371', '#8A2BE2', '#FF4500', '#1E90FF', '#FFD700']
    for idx, col in enumerate(numeric_cols):
        image_path = os.path.join(boxplot_dir, f"{_plot_file_stem(idx, col)}.png")
        figure = plt.figure(figsize=(6, 4))
        try:
            sns.boxplot(x=data[col], color=boxplot_colors[idx % len(boxplot_colors)])
            plt.title(f"Box Plot: {col}")
            plt.savefig(image_path)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(figure)
        pdf.add_page()
        pdf.chapter_title(f"Box Plot: {col}")
        pdf.image(image_path, w=170)

    # --- Distribution charts ----------------------------------------------
    pdf.chapter_title("Distribution Charts")
    dist_dir = "distribution_charts"
    os.makedirs(dist_dir, exist_ok=True)
    if selected_columns is None:
        selected_columns = data.columns[:6]
    else:
        # A requested column may not exist or may have been dropped above as
        # constant; skip it rather than aborting the report with a KeyError.
        requested = list(selected_columns)
        selected_columns = [col for col in requested if col in data.columns]
        skipped = [col for col in requested if col not in data.columns]
        if skipped:
            skipped_names = ", ".join(str(col) for col in skipped)
            pdf.chapter_body(f"Skipped columns (constant or not found): {skipped_names}")
    dist_colors = ['#8B0000', '#228B22', '#DAA520', '#B0C4DE', '#9932CC', '#FF69B4']
    for idx, col in enumerate(selected_columns):
        image_path = os.path.join(dist_dir, f"{_plot_file_stem(idx, col)}.png")
        color = dist_colors[idx % len(dist_colors)]
        figure = plt.figure(figsize=(6, 4))
        try:
            if col in numeric_cols:
                sns.histplot(data[col], kde=True, color=color)
            else:
                data[col].value_counts().plot(kind='bar', color=color)
            plt.title(f"Distribution of {col}")
            plt.savefig(image_path)
        finally:
            plt.close(figure)
        pdf.add_page()
        pdf.chapter_title(f"Distribution: {col}")
        pdf.image(image_path, w=170)

    pdf.output(output_file)
    print(f"Report saved as {output_file}")
def generate_report(file):
    """Gradio callback: turn the uploaded dataset into a PDF report.

    Args:
        file: The value Gradio passes for the upload component. Depending on
            the Gradio version / ``gr.File`` ``type`` setting this is either a
            path (``str`` / ``os.PathLike``) or a file-like object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        Path of the generated PDF (``"data_report.pdf"``), which Gradio serves
        through the output file component.

    Raises:
        ValueError: If no file was uploaded.
    """
    if file is None:
        # Give a readable message instead of an AttributeError on ``None.name``.
        raise ValueError("No file uploaded. Please upload a .csv, .xls or .xlsx dataset.")

    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    output_file = "data_report.pdf"
    generate_data_report(file_path, output_file=output_file)
    return output_file
# Gradio front end: one dataset upload in, one downloadable PDF report out.
iface = gr.Interface(
    title="Data Exploration Tool",
    description="Upload your dataset to generate a PDF data exploration report.",
    fn=generate_report,
    inputs=gr.File(label="Upload Dataset (.csv or .xlsx)"),
    outputs=gr.File(label="Download PDF Report"),
    css="style.css",
)

# Start the web app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|