File size: 5,731 Bytes
d68e4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f78939d
d68e4c1
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import gradio as gr
import os
import pandas as pd
from fpdf import FPDF
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# PDF layout helper (PDFReport) and the report builder (generate_data_report)
class PDFReport(FPDF):
    """FPDF subclass bundling the layout helpers used by the data report.

    Provides a boxed page banner, boxed section titles, plain body
    paragraphs and a simple bordered table.
    """

    def header(self):
        """Draw the boxed, centred report banner in bold Arial 12, then a 10-unit gap."""
        banner_text = 'Data Exploration Report'
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, banner_text, border=1, ln=1, align='C')
        self.ln(10)

    def chapter_title(self, title):
        """Draw *title* as a boxed, centred heading in bold Arial 12, then a 5-unit gap."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, border=1, ln=1, align='C')
        self.ln(5)

    def chapter_body(self, text):
        """Write *text* as a regular Arial 10 paragraph followed by a line break."""
        self.set_font('Arial', '', 10)
        self.multi_cell(0, 10, text)
        self.ln()

    def add_table(self, headers, data, col_widths):
        """Draw a bordered table.

        Args:
            headers: Labels for the bold, centred header row.
            data: Iterable of rows; every entry is converted with ``str``.
            col_widths: Cell widths, looked up by column position.
        """

        def emit_row(values, as_text, **cell_kwargs):
            # One bordered cell per value, then move to the next line.
            for position, value in enumerate(values):
                self.cell(col_widths[position], 10, as_text(value), border=1, **cell_kwargs)
            self.ln()

        # Header row: bold and centred; labels are passed through untouched.
        self.set_font('Arial', 'B', 10)
        emit_row(headers, lambda label: label, align='C')

        # Body rows: regular weight, values stringified cell by cell.
        self.set_font('Arial', '', 10)
        for record in data:
            emit_row(record, str)
def _load_dataset(data):
    """Resolve *data* into a ``(frame, file_path, file_format)`` triple.

    A ``str`` or ``os.PathLike`` is read from disk according to its
    extension; any other object is returned unchanged and labelled as an
    in-memory DataFrame.

    Raises:
        ValueError: If the path has an extension other than .csv/.xls/.xlsx.
    """
    if not isinstance(data, (str, os.PathLike)):
        return data, "DataFrame", "DataFrame"

    file_path = os.fspath(data)
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.csv':
        return pd.read_csv(file_path), file_path, 'CSV'
    if extension in ('.xls', '.xlsx'):
        return pd.read_excel(file_path), file_path, 'Excel'
    # Fail early with a clear message instead of crashing later on `data.shape`.
    raise ValueError(
        f"Unsupported file format {extension!r}: expected .csv, .xls or .xlsx"
    )


def _chart_path(directory, index, column):
    """Return a filesystem-safe PNG path for *column* inside *directory*.

    Column names may contain path separators or other awkward characters, so
    everything except alphanumerics, ``-``, ``_`` and ``.`` becomes ``_``.
    The index prefix keeps two names that sanitise identically apart.
    """
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in str(column)
    )
    return os.path.join(directory, f"{index:02d}_{cleaned}.png")


def _add_chart_page(pdf, page_title, plot_title, image_path, draw, *args, **kwargs):
    """Render one chart to *image_path* and place it on a fresh PDF page.

    ``draw(*args, **kwargs)`` is called after a new 6x4 figure has been made
    current, so it must plot onto the current matplotlib axes.
    """
    fig = plt.figure(figsize=(6, 4))
    try:
        draw(*args, **kwargs)
        plt.title(plot_title)
        fig.savefig(image_path)
    finally:
        # Always release the figure so a failing plot does not leak it.
        plt.close(fig)
    pdf.add_page()
    pdf.chapter_title(page_title)
    pdf.image(image_path, w=170)


def generate_data_report(data, output_file='data_report.pdf', selected_columns=None):
    """Build a PDF exploration report for a dataset.

    The report contains: file summary, columns with missing values, a dtype
    breakdown, constant columns (which are dropped before plotting), one box
    plot per numeric column and one distribution chart per selected column.

    Side effects: creates the ``box_plots`` and ``distribution_charts``
    directories in the working directory, writes one PNG per chart into
    them, writes the PDF to *output_file* and prints a confirmation line.

    Args:
        data: Path (``str`` or ``os.PathLike``) to a .csv/.xls/.xlsx file, or
            an already loaded DataFrame.
        output_file: Destination path of the generated PDF.
        selected_columns: Columns to draw distribution charts for. Defaults
            to the first six columns remaining after constant columns are
            removed. Entries that are not available at plotting time are
            skipped and listed in the report.

    Raises:
        ValueError: If *data* is a path with an unsupported extension.
    """
    data, file_path, file_format = _load_dataset(data)

    pdf = PDFReport()
    pdf.add_page()

    pdf.set_font('Arial', 'B', 12)
    pdf.cell(0, 10, f"File Name: {os.path.basename(file_path)}", ln=True, align='L')
    pdf.cell(0, 10, f"File Format: {file_format}", ln=True, align='L')
    pdf.cell(0, 10, f"Total Data: {data.shape[0]} rows, {data.shape[1]} columns", ln=True, align='L')
    pdf.ln(10)

    pdf.chapter_title("Columns with Missing Values")
    total_values = len(data)
    missing_values = data.isnull().sum()
    missing_cols = [
        [col, total_values, missing_values[col]]
        for col in missing_values[missing_values > 0].index
    ]
    if missing_cols:
        pdf.add_table(["Column Name", "Total Values", "Missing Values"], missing_cols, [100, 40, 50])
    else:
        pdf.chapter_body("No columns with missing values.")

    pdf.chapter_title("Columns Categorized by Data Type")
    dtypes_summary = data.dtypes.value_counts().reset_index()
    dtypes_summary.columns = ['Data Type', 'Count']
    pdf.add_table(["Data Type", "Count"], dtypes_summary.values.tolist(), [100, 50])

    # Group column names by dtype label in a single pass, keeping the order
    # in which each dtype first appears.
    column_types = {}
    for col, dtype in data.dtypes.items():
        column_types.setdefault(str(dtype), []).append(col)

    for dtype, columns in column_types.items():
        pdf.chapter_title(f"Columns of Type: {dtype}")
        pdf.add_table(["Column Name"], [[col] for col in columns], [190])

    pdf.chapter_title("Constant Columns")
    constant_cols = [col for col in data.columns if data[col].nunique() == 1]
    if constant_cols:
        pdf.add_table(["Constant Column Name"], [[col] for col in constant_cols], [190])
        # Constant columns carry no information for the plots below.
        data = data.drop(columns=constant_cols)
        pdf.chapter_body("Constant Columns After Removal: None")
    else:
        pdf.chapter_body("No constant columns found.")

    pdf.chapter_title("Box Plots for Numeric Columns")
    numeric_cols = data.select_dtypes(include=np.number).columns
    boxplot_dir = "box_plots"
    os.makedirs(boxplot_dir, exist_ok=True)

    boxplot_colors = ['#FF6347', '#3CB371', '#8A2BE2', '#FF4500', '#1E90FF', '#FFD700']

    for idx, col in enumerate(numeric_cols):
        _add_chart_page(
            pdf,
            f"Box Plot: {col}",
            f"Box Plot: {col}",
            _chart_path(boxplot_dir, idx, col),
            sns.boxplot,
            x=data[col],
            color=boxplot_colors[idx % len(boxplot_colors)],
        )

    pdf.chapter_title("Distribution Charts")
    dist_dir = "distribution_charts"
    os.makedirs(dist_dir, exist_ok=True)
    if selected_columns is None:
        selected_columns = data.columns[:6]

    # A requested column may have been dropped as constant or may never have
    # existed; report it instead of failing with a KeyError mid-report.
    plottable = [col for col in selected_columns if col in data.columns]
    skipped = [col for col in selected_columns if col not in data.columns]
    if skipped:
        pdf.chapter_body(
            "Skipped columns (constant or not in the dataset): "
            + ", ".join(str(col) for col in skipped)
        )

    dist_colors = ['#8B0000', '#228B22', '#DAA520', '#B0C4DE', '#9932CC', '#FF69B4']

    for idx, col in enumerate(plottable):
        color = dist_colors[idx % len(dist_colors)]
        image_path = _chart_path(dist_dir, idx, col)
        if col in numeric_cols:
            _add_chart_page(
                pdf, f"Distribution: {col}", f"Distribution of {col}", image_path,
                sns.histplot, data[col], kde=True, color=color,
            )
        else:
            _add_chart_page(
                pdf, f"Distribution: {col}", f"Distribution of {col}", image_path,
                data[col].value_counts().plot, kind='bar', color=color,
            )

    pdf.output(output_file)
    print(f"Report saved as {output_file}")
def generate_report(file):
    """Gradio callback: build the PDF report for an uploaded dataset.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input. This
            may be a plain path string / path-like object, or a temp-file
            wrapper exposing the path as ``.name``; both are accepted.

    Returns:
        Path of the generated PDF (``"data_report.pdf"``).

    Raises:
        ValueError: If no file was uploaded (``file`` is ``None``).
    """
    if file is None:
        raise ValueError("No file was uploaded; please upload a .csv or .xlsx dataset.")

    # Path-like values are used directly; wrapper objects carry the path in `.name`.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    output_file = "data_report.pdf"
    generate_data_report(file_path, output_file=output_file)
    return output_file

# Gradio front end: a single dataset upload in, the generated PDF report out.
iface = gr.Interface(
    title="Data Exploration Tool",
    description="Upload your dataset to generate a PDF data exploration report.",
    css="style.css",
    fn=generate_report,
    inputs=gr.File(label="Upload Dataset (.csv or .xlsx)"),
    outputs=gr.File(label="Download PDF Report"),
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()