Chri12345 committed on
Commit
64516cb
·
verified ·
1 Parent(s): ef4ac7e

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. README.md +2 -11
  3. app.py +221 -0
  4. requirements.txt +12 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md CHANGED
@@ -1,12 +1,3 @@
1
- ---
2
- title: Assignment2
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.38.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ## 📜 **License**
 
 
 
 
 
 
 
 
 
2
 
3
+ Licensed under the MIT License. See the LICENSE file for more details. If you don't like the license, well... good luck changing it! 😄
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import zipfile
3
+ import io
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import os
9
+ import altair as alt
10
+ import streamlit as st
11
+ import statsmodels.formula.api as smf
12
+ from duckduckgo_search import DDGS
13
+
14
# Page header: dashboard title, background on the Kiva dataset, and the
# research question the rest of the dashboard tries to answer.
PAGE_TITLE = "Assignment 2: Building a Data Dashboard with Streamlit"

KIVA_BACKGROUND = """
**Kiva** is a non-profit organization that facilitates microfinancing for entrepreneurs and small businesses in low-income communities around the world. By providing a platform where individuals can lend small amounts of money to borrowers in developing regions, Kiva aims to expand financial inclusion and foster economic development.

The dataset in question encompasses a broad range of variables related to Kiva loans. It includes information on the gender of the borrowers, the amounts of the loans, the number of lenders participating in each loan, and the duration of the loans. This comprehensive dataset allows us to conduct an in-depth analysis of various dimensions of Kiva’s microfinance operations. By examining these variables, we can explore patterns and trends in borrowing behavior, loan distribution, and the impact of microfinance on different demographic groups and regions.
"""

RESEARCH_QUESTION = """We have the following research question that we aim to
investigate and attempt to answer:
Do men borrow more money than women?"""

st.title(PAGE_TITLE)
st.markdown(KIVA_BACKGROUND)
st.markdown(RESEARCH_QUESTION)
27
+
28
# URL template for the zipped CSV parts of the Kiva loans dataset.
KIVA_DATA_URL = (
    "https://github.com/aaubs/ds-master/raw/main/data/"
    "assignments_datasets/KIVA/kiva_loans_part_{part}.csv.zip"
)


@st.cache_data  # Cached per `part`, so reruns do not download the archive again
def load_data(part: int = 0) -> pd.DataFrame:
    """Download one zipped part of the Kiva loans dataset and parse it.

    Args:
        part: Number of the dataset part to fetch; it selects both the
            archive ``kiva_loans_part_<part>.csv.zip`` and the CSV member
            ``kiva_loans_part_<part>.csv`` inside it.

    Returns:
        The CSV contents as a DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
        KeyError: If the archive does not contain the expected CSV member.
    """
    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(KIVA_DATA_URL.format(part=part), timeout=60)
    response.raise_for_status()

    # The archive is held in memory only; nothing is written to disk.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        with archive.open(f"kiva_loans_part_{part}.csv") as csv_file:
            return pd.read_csv(csv_file)


# Load the three dataset parts used by the dashboard.
df1 = load_data(0)
df2 = load_data(1)
df3 = load_data(2)
96
+
97
# Combine the downloaded parts. ``ignore_index=True`` gives every row a
# unique label instead of repeating each part's own 0..n index.
data = pd.concat([df1, df2, df3], ignore_index=True)
total_rows = len(data)  # Row count before cleaning, used for the kept share

# Drop ``tags`` before ``dropna`` so its missing values do not discard rows.
data = data.drop(columns=['tags']).dropna()

# Keep single-borrower loans only; ``copy`` makes ``data`` an independent
# frame so columns can be added to it later without chained-assignment issues.
valid_genders = ['male', 'female']
data = data[data['borrower_genders'].isin(valid_genders)].copy()

st.subheader("""Cleaning data""")
st.markdown("""We have eliminated the column tags, as well as the associated tags,
since they merely consisted of quotations such as “User favorite,”
among others. Additionally, these columns contained a
significant amount of missing data (NAs).""")

# Derive the share from the actual row count rather than a hard-coded total.
kept_percent = len(data) / total_rows * 100 if total_rows else 0.0
st.text(f'We just saved {kept_percent:.2f} % of the data!')
st.text(f'Number of remaining {len(data)} rows')
112
+
113
+
114
+
115
st.subheader("Basic statistics for key variables")

# Compute the summary once and reuse it for both the table and the prompt.
key_stats = data[['loan_amount', 'term_in_months', 'lender_count']].agg(
    ['mean', 'var', 'min', 'median', 'max', 'sum'])
st.dataframe(key_stats)

st.markdown("""How to interpret the data?""")


@st.cache_data  # Streamlit reruns the script on every interaction; ask only once per prompt
def _interpret_statistics(prompt: str) -> str:
    """Send *prompt* to the DuckDuckGo chat model and return its answer."""
    return DDGS().chat(prompt, model='gpt-4o-mini')


stats_prompt = (
    "You are an extremely good statician with lots of knowledge about statistics. "
    "Interpret the following statistic results: " + str(key_stats) +
    " summarize the results in a easy understanding way and with normal text"
)

try:
    results_stat = _interpret_statistics(stats_prompt)
except Exception as exc:  # Optional external service: never take the dashboard down
    results_stat = None
    st.warning(f"The automatic interpretation is currently unavailable ({exc}).")
else:
    st.markdown(results_stat)
124
+
125
+
126
+
127
+
128
# Interactive table: aggregate the chosen variables with the chosen
# statistics, grouped by borrower gender and sector.
VARIABLE_CHOICES = ['loan_amount', 'term_in_months', 'lender_count']
STATISTIC_CHOICES = ['mean', 'var', 'min', 'median', 'max', 'sum', 'std']
GENDER_CHOICES = ['male', 'female']

st.markdown('Pick what to group by')
selected1 = st.multiselect("Select variable1", VARIABLE_CHOICES)

st.markdown('Pick what statistic to inspect')
selected2 = st.multiselect("Select statistic(s)", STATISTIC_CHOICES)

st.markdown('Pick borrower genders to include')
selected_genders = st.multiselect("Select borrower genders", GENDER_CHOICES)

# The table is only shown once every selector has at least one entry.
if not all((selected1, selected2, selected_genders)):
    st.write("Please select at least one variable, one statistic, and at least one gender.")
else:
    gender_mask = data['borrower_genders'].isin(selected_genders)
    filtered_data = data[gender_mask]
    grouped = filtered_data.groupby(['borrower_genders', 'sector'])
    st.table(grouped[selected1].agg(selected2))
142
+
143
+
144
+
145
+
146
st.subheader("Visualizations")


def _render_figure(fig):
    """Show *fig* in the app, then close it so figures do not pile up across reruns."""
    st.pyplot(fig, use_container_width=True)
    plt.close(fig)


# Dropdown to select the type of visualization
visualization_option = st.selectbox(
    "Select Visualization 🎨",
    ["Number of loans in sectors Distribution",
     "Loan Amount Distribution by Gender",
     "Loan Amount Distribution by Sector Type",
     "KDE Plot: Loan amount based on sectors",
     "Correlation Matrix of Loan amount, length of loan and amount of lenders"]
)

# Every branch draws on its own figure/axes instead of pyplot's implicit
# "current figure", so one plot can never be drawn on top of a previous one.
if visualization_option == "Number of loans in sectors Distribution":
    # Histogram of how many loans fall into each sector
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(data['sector'], kde=True, ax=ax)
    ax.set_title('Number of loans in sectors Distribution')
    ax.set_xlabel('Sector')
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', labelrotation=45)
    _render_figure(fig)

elif visualization_option == "KDE Plot: Loan amount based on sectors":
    # Density of the loan amount per sector, evaluated on the range 0-4000
    fig, ax = plt.subplots()
    sns.kdeplot(data=data, x='loan_amount', hue='sector', clip=(0, 4000), ax=ax)
    ax.set_title('KDE Plot: Loan amount based on sectors')
    _render_figure(fig)

elif visualization_option == "Loan Amount Distribution by Gender":
    # Box plot of the loan amount per borrower gender, most frequent gender first
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='borrower_genders', y='loan_amount', data=data,
                order=data['borrower_genders'].value_counts().index, ax=ax)
    ax.set_title('Loan Amount Distribution by Gender')
    ax.set_xlabel('Borrower Gender')
    ax.set_ylabel('Loan amount')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylim(0, 3000)  # Zoom in; points above the limit are cut off
    _render_figure(fig)

elif visualization_option == "Loan Amount Distribution by Sector Type":
    # Box plot of the loan amount per sector, most frequent sector first
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='sector', y='loan_amount', data=data,
                order=data['sector'].value_counts().index, ax=ax)
    ax.set_title('Loan Amount Distribution by Sector Type')
    ax.set_xlabel('Sector')
    ax.set_ylabel('Loan amount')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylim(0, 12500)  # Zoom in; points above the limit are cut off
    _render_figure(fig)

elif visualization_option == "Correlation Matrix of Loan amount, length of loan and amount of lenders":
    # Spearman rank correlations, computed only when this view is selected
    correlation_matrix = data[['loan_amount', 'term_in_months', 'lender_count']].corr(method='spearman')
    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Matrix of Loan amount, length of loan and amount of lenders')
    _render_figure(fig)
202
+
203
+
204
+
205
+
206
st.subheader("Regression")

# Encode gender as a 0/1 dummy (male = 1). ``assign`` builds a new frame, so
# this never writes into a filtered view of another DataFrame.
data = data.assign(gender_binary=(data['borrower_genders'] == 'male').astype(int))

# OLS of the loan amount on gender, controlling for lender count and loan term.
model = smf.ols('loan_amount ~gender_binary+ lender_count+ term_in_months', data=data).fit()
st.write(model.summary())

# State the conclusion from the fitted model instead of a hard-coded sentence,
# so the text can never contradict the regression table above it.
gender_coef = model.params['gender_binary']
gender_p_value = model.pvalues['gender_binary']
direction = "larger" if gender_coef > 0 else "smaller"
verdict = "statistically significant" if gender_p_value < 0.05 else "not statistically significant"
st.subheader(
    f"Holding lender count and loan term constant, loans to men are on average "
    f"{abs(gender_coef):,.2f} {direction} than loans to women "
    f"(p-value {gender_p_value:.3g}, {verdict} at the 5% level)."
)

st.subheader("The world-known economist answering the OLS-regression")


@st.cache_data  # Streamlit reruns the script on every interaction; ask only once per prompt
def _interpret_regression(prompt: str) -> str:
    """Send *prompt* to the DuckDuckGo chat model and return its answer."""
    return DDGS().chat(prompt, model='gpt-4o-mini')


regression_prompt = (
    "You are an extremely good economist with lots of knowledge about econometrics. "
    "Interpret the following OLS results: " + str(model.summary()) +
    ". Specifically, answer if men borrow more money than women."
)

try:
    results = _interpret_regression(regression_prompt)
except Exception as exc:  # Optional external service: never take the dashboard down
    results = None
    st.warning(f"The automatic interpretation is currently unavailable ({exc}).")
else:
    st.markdown(results)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party packages only. zipfile, io and os ship with Python itself, and
# statsmodels.formula.api is a module inside the statsmodels package, so none
# of them can be listed as a pip requirement.
requests
pandas
numpy
matplotlib
seaborn
altair
streamlit
statsmodels
duckduckgo_search