Spaces:
Sleeping
Sleeping
File size: 8,751 Bytes
64516cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import requests
import zipfile
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import altair as alt
import streamlit as st
import statsmodels.formula.api as smf
from duckduckgo_search import DDGS
# Page header: dashboard title, background on Kiva and the dataset, and the
# research question. The texts are named first and rendered in order below.
_page_title = "Assignment 2: Building a Data Dashboard with Streamlit"
_kiva_background = (
    "\n"
    "**Kiva** is a non-profit organization that facilitates microfinancing for entrepreneurs and small businesses in low-income communities around the world. By providing a platform where individuals can lend small amounts of money to borrowers in developing regions, Kiva aims to expand financial inclusion and foster economic development.\n"
    "The dataset in question encompasses a broad range of variables related to Kiva loans. It includes information on the gender of the borrowers, the amounts of the loans, the number of lenders participating in each loan, and the duration of the loans. This comprehensive dataset allows us to conduct an in-depth analysis of various dimensions of Kiva’s microfinance operations. By examining these variables, we can explore patterns and trends in borrowing behavior, loan distribution, and the impact of microfinance on different demographic groups and regions.\n"
)
_research_question = (
    "We have the following research question that we aim to\n"
    "investigate and attempt to answer:\n"
    "Do men borrow more money than women?"
)

st.title(_page_title)
st.markdown(_kiva_background)
st.markdown(_research_question)
# Folder that hosts the zipped CSV parts of the KIVA loans dataset.
KIVA_DATA_URL = "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA"


@st.cache_data  # Cache the function to enhance performance
def load_data(part: int = 2) -> pd.DataFrame:
    """Download one zipped part of the KIVA loans dataset and return it as a DataFrame.

    Args:
        part: Index of the dataset part to fetch; the script uses 0, 1 and 2.
            Defaults to 2 so that a bare ``load_data()`` call keeps returning
            the same part as the last of the three former definitions.

    Returns:
        The contents of ``kiva_loans_part_<part>.csv`` read with ``pd.read_csv``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    csv_name = f"kiva_loans_part_{part}.csv"
    zip_url = f"{KIVA_DATA_URL}/{csv_name}.zip"
    # (connect, read) timeout: without one, a stalled connection would block
    # the app forever. The read value limits the wait between chunks, not the
    # total download time.
    response = requests.get(zip_url, timeout=(10, 120))
    response.raise_for_status()  # Check if the request was successful
    # The archive is opened from memory; both the archive and the CSV member
    # are closed as soon as the frame has been parsed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        with zf.open(csv_name) as csv_file:
            return pd.read_csv(csv_file)


# Load the three dataset parts using the defined function
df1 = load_data(0)
df2 = load_data(1)
df3 = load_data(2)
# Stack the three dataset parts into one frame. ignore_index=True renumbers
# the rows so every label is unique; concatenating keeps each part's own
# labels otherwise, and duplicated labels break label-based alignment later.
data = pd.concat([df1, df2, df3], ignore_index=True)
# Remember how many rows were loaded so the retained share reported below is
# derived from the data rather than from a hard-coded row count.
total_rows = len(data)
data.drop(['tags'], axis='columns', inplace=True)
# Drop every row that still has a missing value in any remaining column.
data.dropna(inplace=True)
# Keep only loans whose borrower_genders value is exactly 'male' or 'female'.
valid_genders = ['male', 'female']
data = data[data['borrower_genders'].isin(valid_genders)]
st.subheader("""Cleaning data""")
st.markdown("""We have eliminated the column tags, as well as the associated tags,
since they merely consisted of quotations such as “User favorite,”
among others. Additionally, these columns contained a
significant amount of missing data (NAs).""")
# Guard against an empty download so the percentage never divides by zero.
retained_share = (len(data) / total_rows) * 100 if total_rows else 0.0
st.text(f'We just saved {retained_share} % of the data!')
st.text(f'Number of remaining {len(data)} rows')
st.subheader("Basic statistics for key variables")
# Summarise the three numeric loan variables once; the same table is shown to
# the user and embedded in the prompt, so both always describe identical numbers.
key_variables = ['loan_amount', 'term_in_months', 'lender_count']
summary_stats = data[key_variables].agg(['mean', 'var', 'min', 'median', 'max', 'sum'])
st.dataframe(summary_stats)
st.markdown("""How to interpret the data?""")
# Ask the chat model for a plain-language reading of the summary table.
# NOTE(review): this request is sent each time the script runs and is not
# guarded against failures of the chat service - consider caching/handling.
results_stat = DDGS().chat(
    "You are an extremely good statician with lots of knowledge about statistics. "
    "Interpret the following statistic results: " + str(summary_stats) + " summarize the results in a easy understanding way and with normal text",
    model='gpt-4o-mini')
st.markdown(results_stat)

# Interactive table: the chosen statistics of the chosen variables, always
# grouped by borrower gender and sector, restricted to the chosen genders.
st.markdown('Pick what to group by')
selected1 = st.multiselect("Select variable1", key_variables)
st.markdown('Pick what statistic to inspect')
selected2 = st.multiselect("Select statistic(s)", ['mean', 'var', 'min', 'median', 'max', 'sum', 'std'])
st.markdown('Pick borrower genders to include')
selected_genders = st.multiselect("Select borrower genders", ['male', 'female'])
if selected1 and selected2 and selected_genders:
    filtered_data = data[data['borrower_genders'].isin(selected_genders)]
    st.table(filtered_data.groupby(['borrower_genders', 'sector'])[selected1].agg(selected2))
else:
    # At least one of the three selections is still empty.
    st.write("Please select at least one variable, one statistic, and at least one gender.")
st.subheader("Visualizations")
# Spearman rank correlation between the three numeric loan variables; it is
# rendered by the correlation-matrix option below.
correlation_matrix = data[['loan_amount', 'term_in_months', 'lender_count']].corr(method='spearman')
# Dropdown to select the type of visualization
visualization_option = st.selectbox(
    "Select Visualization 🎨",
    ["Number of loans in sectors Distribution",
     "Loan Amount Distribution by Gender",
     "Loan Amount Distribution by Sector Type",
     "KDE Plot: Loan amount based on sectors",
     "Correlation Matrix of Loan amount, length of loan and amount of lenders"]
)
# Visualizations based on user selection.
# Every branch draws on its own Figure/Axes, hands that Figure to st.pyplot,
# and closes it afterwards. pyplot keeps each figure it creates alive until it
# is closed, so drawing on the implicit "current" figure would let plots pile
# up, and branches that never opened a figure would paint over an old one.
if visualization_option == "Number of loans in sectors Distribution":
    # Histogram of how many loans fall into each sector
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(data['sector'], kde=True, ax=ax)
    ax.set_title('Number of loans in sectors Distribution')
    ax.set_xlabel('Sector')
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig, use_container_width=True)
    plt.close(fig)
elif visualization_option == "KDE Plot: Loan amount based on sectors":
    # Density of the loan amount per sector, evaluated only between 0 and 4000
    fig, ax = plt.subplots()
    sns.kdeplot(data=data, x='loan_amount', hue='sector', clip=(0, 4000), ax=ax)
    ax.set_title('KDE Plot: Loan amount based on sectors')
    st.pyplot(fig)
    plt.close(fig)
elif visualization_option == "Loan Amount Distribution by Gender":
    # Box plot of the loan amount per borrower gender, most frequent gender first
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='borrower_genders', y='loan_amount', data=data,
                order=data['borrower_genders'].value_counts().index, ax=ax)
    ax.set_title('Loan Amount Distribution by Gender')
    ax.set_xlabel('Borrower Gender')
    ax.set_ylabel('Loan amount')
    ax.tick_params(axis='x', labelrotation=45)
    # Cap the y-axis so the boxes stay readable; larger amounts are cut off.
    ax.set_ylim(0, 3000)
    st.pyplot(fig, use_container_width=True)
    plt.close(fig)
elif visualization_option == "Loan Amount Distribution by Sector Type":
    # Box plot of the loan amount per sector, most frequent sector first
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='sector', y='loan_amount', data=data,
                order=data['sector'].value_counts().index, ax=ax)
    ax.set_title('Loan Amount Distribution by Sector Type')
    ax.set_xlabel('Sector')
    ax.set_ylabel('Loan amount')
    ax.tick_params(axis='x', labelrotation=45)
    # Cap the y-axis so the boxes stay readable; larger amounts are cut off.
    ax.set_ylim(0, 12500)
    st.pyplot(fig, use_container_width=True)
    plt.close(fig)
elif visualization_option == "Correlation Matrix of Loan amount, length of loan and amount of lenders":
    # Annotated heat map of the Spearman correlation matrix computed above
    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax).set_title('Correlation Matrix of Loan amount, length of loan and amount of lenders')
    st.pyplot(fig)
    plt.close(fig)
st.subheader("Regression")
# Dummy-encode gender: 1 for 'male', 0 for every other value. The comparison
# is vectorized, and assign() returns a new frame, so the column is never
# written into a filtered slice of an earlier DataFrame.
data = data.assign(gender_binary=(data['borrower_genders'] == 'male').astype(int))
# OLS of the loan amount on the gender dummy, controlling for the number of
# lenders and the loan term.
model = smf.ols('loan_amount ~gender_binary+ lender_count+ term_in_months', data = data).fit()
# Build the summary once so the table on screen and the text sent to the chat
# model are the same object.
ols_summary = model.summary()
st.write(ols_summary)
# NOTE(review): this headline is a fixed string; it is not derived from the
# fitted model above - confirm it still matches the regression output.
st.subheader("""We can conclude with 73% significans that men borrow more money than women.""")
st.subheader("The world-known economist answering the OLS-regression")
# Ask the chat model to interpret the regression table with respect to the
# research question.
results = DDGS().chat(
    "You are an extremely good economist with lots of knowledge about econometrics. "
    "Interpret the following OLS results: " + str(ols_summary) +
    ". Specifically, answer if men borrow more money than women.",
    model='gpt-4o-mini')
st.markdown(results)
|