Spaces:

Chri12345
/

Assignment2

Sleeping

App Files Files Community

Chri12345 committed on Sep 11, 2024

Commit

64516cb

verified ·

1 Parent(s): ef4ac7e

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

.DS_Store +0 -0
README.md +2 -11
app.py +221 -0
requirements.txt +12 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

README.md CHANGED Viewed

@@ -1,12 +1,3 @@
----
-title: Assignment2
-emoji: 👁
-colorFrom: yellow
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.38.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ ## 📜 License









2
3	+ Licensed under the MIT License. See the LICENSE file for more details. If you don't like the license, well... good luck changing it! 😄

app.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import requests
+import zipfile
+import io
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+import altair as alt
+import streamlit as st
+import statsmodels.formula.api as smf
+from duckduckgo_search import DDGS
+st.title("Assignment 2: Building a Data Dashboard with Streamlit")
+st.markdown("""
+**Kiva** is a non-profit organization that facilitates microfinancing for entrepreneurs and small businesses in low-income communities around the world. By providing a platform where individuals can lend small amounts of money to borrowers in developing regions, Kiva aims to expand financial inclusion and foster economic development.
+The dataset in question encompasses a broad range of variables related to Kiva loans. It includes information on the gender of the borrowers, the amounts of the loans, the number of lenders participating in each loan, and the duration of the loans. This comprehensive dataset allows us to conduct an in-depth analysis of various dimensions of Kiva’s microfinance operations. By examining these variables, we can explore patterns and trends in borrowing behavior, loan distribution, and the impact of microfinance on different demographic groups and regions.
+""")
+st.markdown("""We have the following research question that we aim to
+        investigate and attempt to answer:
+        Do men borrow more money than women?""")
+@st.cache_data  # Cache the function to enhance performance
+def load_data():
+    # Define the file path
+    zip_url_1= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_0.csv.zip"
+    # Download the ZIP file
+    response = requests.get(zip_url_1)
+    response.raise_for_status()  # Check if the request was successful
+    # Open the ZIP file from the response content
+    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
+        # List all files in the ZIP
+        print(zf.namelist())
+        # Read a specific CSV file from the ZIP
+        df1 = pd.read_csv(zf.open('kiva_loans_part_0.csv'))
+    return df1
+# Load the data using the defined function
+df1 = load_data()
+@st.cache_data  # Cache the function to enhance performance
+def load_data():
+    # Define the file path
+    zip_url_2= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_1.csv.zip"
+    # Download the ZIP file
+    response = requests.get(zip_url_2)
+    response.raise_for_status()  # Check if the request was successful
+    # Open the ZIP file from the response content
+    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
+        # List all files in the ZIP
+        print(zf.namelist())
+        # Read a specific CSV file from the ZIP
+        df2 = pd.read_csv(zf.open('kiva_loans_part_1.csv'))
+    return df2
+# Load the data using the defined function
+df2 = load_data()
+@st.cache_data  # Cache the function to enhance performance
+def load_data():
+    # Define the file path
+    zip_url_3= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_2.csv.zip"
+    # Download the ZIP file
+    response = requests.get(zip_url_3)
+    response.raise_for_status()  # Check if the request was successful
+    # Open the ZIP file from the response content
+    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
+        # List all files in the ZIP
+        print(zf.namelist())
+        # Read a specific CSV file from the ZIP
+        df3 = pd.read_csv(zf.open('kiva_loans_part_2.csv'))
+    return df3
+# Load the data using the defined function
+df3 = load_data()
+data = pd.concat([df1, df2, df3])
+data.drop(['tags'], axis = 'columns', inplace = True)
+data.dropna(inplace = True)
+valid_genders = ['male', 'female']
+data = data[data['borrower_genders'].isin(valid_genders)]
+st.subheader("""Cleaning data""")
+st.markdown("""We have eliminated the column tags, as well as the associated tags,
+        since they merely consisted of quotations such as “User favorite,”
+        among others. Additionally, these columns contained a
+        significant amount of missing data (NAs).""")
+st.text(f'We just saved {(len(data) / 671205) * 100} % of the data!')
+st.text(f'Number of remaining {len(data)} rows')
+st.subheader("Basic statistics for key variables")
+st.dataframe(data[['loan_amount','term_in_months','lender_count']].agg(['mean','var','min','median','max','sum']))
+st.markdown("""How to interpret the data?""")
+results_stat = DDGS().chat(
+    "You are an extremely good statician with lots of knowledge about statistics. "
+    "Interpret the following statistic results: " + str(data[['loan_amount','term_in_months','lender_count']].agg(['mean','var','min','median','max','sum'])) +" summarize the results in a easy understanding way and with normal text",
+    model='gpt-4o-mini')
+st.markdown(results_stat)
+st.markdown('Pick what to group by')
+selected1 = st.multiselect("Select variable1", ['loan_amount', 'term_in_months', 'lender_count'])
+st.markdown('Pick what statistic to inspect')
+selected2 = st.multiselect("Select statistic(s)", ['mean', 'var', 'min', 'median', 'max', 'sum', 'std'])
+st.markdown('Pick borrower genders to include')
+selected_genders = st.multiselect("Select borrower genders", ['male', 'female'])
+if selected1 and selected2 and selected_genders:
+    filtered_data = data[data['borrower_genders'].isin(selected_genders)]
+    st.table(filtered_data.groupby(['borrower_genders', 'sector'])[selected1].agg(selected2))
+else:
+    st.write("Please select at least one variable, one statistic, and at least one gender.")
+st.subheader("Visualizations")
+correlation_matrix = data[['loan_amount', 'term_in_months', 'lender_count']].corr(method='spearman')
+# Dropdown to select the type of visualization
+visualization_option = st.selectbox(
+    "Select Visualization 🎨",
+    ["Number of loans in sectors Distribution",
+     "Loan Amount Distribution by Gender",
+     "Loan Amount Distribution by Sector Type",
+     "KDE Plot: Loan amount based on sectors",
+     "Correlation Matrix of Loan amount, length of loan and amount of lenders"]
+)
+# Visualizations based on user selection
+if visualization_option == "Number of loans in sectors Distribution":
+    plt.figure(figsize=(12, 6))
+    # Number of loans in sectors Distribution
+    sns.histplot(data['sector'], kde=True)
+    plt.title('Number of loans in sectors Distribution')
+    plt.xlabel('Sector')
+    plt.ylabel('Frequency')
+    plt.xticks(rotation=45)
+    plt.show()
+    st.pyplot(plt, use_container_width=True)
+elif visualization_option == "KDE Plot: Loan amount based on sectors":
+    # KDE plot for Distance from Home based on Attrition
+    sns.kdeplot(data = data, x = 'loan_amount', hue = 'sector', clip = (0,4000))
+    plt.title('KDE Plot: Loan amount based on sectors')
+    st.pyplot(plt)
+elif visualization_option == "Loan Amount Distribution by Gender":
+    # Bar chart for attrition by job role
+    plt.figure(figsize=(12, 6))
+    sns.boxplot(x='borrower_genders', y='loan_amount', data=data, order=data['borrower_genders'].value_counts().index)
+    plt.title('Loan Amount Distribution by Gender')
+    plt.xlabel('Borrower Gender')
+    plt.ylabel('Loan amount')
+    plt.xticks(rotation=45)
+    plt.ylim(0, 3000)
+    st.pyplot(plt, use_container_width=True)
+elif visualization_option == "Loan Amount Distribution by Sector Type":
+    plt.figure(figsize=(12, 6))
+    sns.boxplot(x='sector', y='loan_amount', data=data, order=data['sector'].value_counts().index)
+    plt.title('Loan Amount Distribution by Sector Type')
+    plt.xlabel('Sector')
+    plt.ylabel('Loan amount')
+    plt.xticks(rotation=45)
+    plt.ylim(0, 12500)
+    st.pyplot(plt, use_container_width=True)
+elif visualization_option == "Correlation Matrix of Loan amount, length of loan and amount of lenders":
+    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm').set_title('Correlation Matrix of Loan amount, length of loan and amount of lenders')
+    st.pyplot(plt)
+st.subheader("Regression")
+data['gender_binary'] = data['borrower_genders'].apply(lambda x: 1 if x == 'male' else 0)
+model = smf.ols('loan_amount ~gender_binary+ lender_count+ term_in_months', data = data).fit()
+st.write(model.summary())
+st.subheader("""We can conclude with 73% significans that men borrow more money than women.""")
+st.subheader("The world-known economist answering the OLS-regression")
+results = DDGS().chat(
+    "You are an extremely good economist with lots of knowledge about econometrics. "
+    "Interpret the following OLS results: " + str(model.summary()) +
+    ". Specifically, answer if men borrow more money than women.",
+    model='gpt-4o-mini')
+st.markdown(results)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+# Third-party packages only. zipfile, io and os ship with Python and must not
+# be listed; statsmodels.formula.api is a module inside the statsmodels package.
+requests
+pandas
+numpy
+matplotlib
+seaborn
+altair
+streamlit
+statsmodels
+duckduckgo_search