DebopamC committed (verified)
Commit fae86c5 · 1 Parent(s): 4f291f6

Upload 20 files

.dockerignore ADDED
@@ -0,0 +1,91 @@
+ # Git
+ .git
+ .gitignore
+ .gitattributes
+
+
+ # CI
+ .codeclimate.yml
+ .travis.yml
+ .taskcluster.yml
+
+ # Docker
+ docker-compose.yml
+ Dockerfile
+ .docker
+ .dockerignore
+
+ # Byte-compiled / optimized / DLL files
+ **/__pycache__/
+ **/*.py[cod]
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .cache
+ nosetests.xml
+ coverage.xml
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Virtual environment
+ .env
+ .venv/
+ venv/
+
+ # PyCharm
+ .idea
+
+ # Python mode for VIM
+ .ropeproject
+ **/.ropeproject
+
+ # Vim swap files
+ **/*.swp
+
+ # VS Code
+ .vscode/
+
+ start.sh
.gitignore ADDED
@@ -0,0 +1,171 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
+ [server]
+ enableStaticServing = true
+ maxUploadSize = 30
+
+ [client]
+ showErrorDetails = true
+
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.12-slim-bookworm
+
+ WORKDIR /app
+
+ # Update package index and install necessary tools, including curl
+ RUN apt-get update && apt-get install -y curl
+
+ # Download the model
+ RUN curl -Lo qwen2.5-coder-3b-instruct-q4_k_m.gguf https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct-GGUF/resolve/main/qwen2.5-coder-3b-instruct-q4_k_m.gguf?download=true
+
+ # Install build tools required for llama-cpp-python
+ RUN apt-get update && apt-get install -y build-essential
+
+ # Copy requirements and install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Redundant command to invalidate cache
+ RUN echo "Intentionally invalidating cache"
+
+ # Copy the application code
+ COPY . .
+
+ # Expose the port that Streamlit will run on
+ EXPOSE 8501
+
+ # Define the entrypoint to run the Streamlit application
+ ENTRYPOINT ["streamlit", "run", "🤖SQL_Agent.py", "--server.port=8501", "--server.address=0.0.0.0"]
pages/2 📊Manual SQL Executer.py ADDED
@@ -0,0 +1,129 @@
+ import streamlit as st
+ from utils.sql_utils import load_data, load_defaultdb_schema_text
+ from utils.handle_sql_commands import execute_sql_duckdb
+
+ st.markdown(
+     """
+     <style>
+     /* Base styles for both themes */
+     .stPageLink {
+         background-image: linear-gradient(to right, #007BFF, #6610F2); /* Gradient background */
+         color: white !important; /* Ensure text is readable on the gradient */
+         padding: 12px 20px !important; /* Slightly larger padding */
+         border-radius: 8px !important; /* More rounded corners */
+         border: none !important; /* Remove default border */
+         text-decoration: none !important;
+         font-weight: 500 !important; /* Slightly lighter font weight */
+         transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out; /* Smooth transitions */
+         box-shadow: 0 2px 5px rgba(0, 0, 0, 0.15); /* Subtle shadow for depth */
+         display: inline-flex;
+         align-items: center;
+         justify-content: center;
+     }
+
+     .stPageLink:hover {
+         transform: scale(1.03); /* Slight scale up on hover */
+         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); /* Increased shadow on hover */
+     }
+
+     .stPageLink span { /* Style the label text */
+         margin-left: 5px; /* Space between icon and text */
+     }
+
+     /* Dark theme adjustments (optional, if needed for better contrast) */
+     /* Consider using Streamlit's theme variables if possible for a more robust solution */
+     /* For simplicity, this example uses fixed colors that should work reasonably well */
+     /* [data-theme="dark"] .stPageLink {
+     }
+
+     [data-theme="dark"] .stPageLink:hover {
+     } */
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ default_dfs = load_data()
+
+ if "uploaded_dataframes" not in st.session_state:
+     st.session_state.uploaded_dataframes = {}
+
+ uploaded_dataframes = st.session_state.uploaded_dataframes
+
+
+ with st.popover("Click here to see the Database Schema", use_container_width=True):
+     uploaded_df_schema = st.session_state.get("uploaded_df_schema", False)
+
+     choice = st.segmented_control(
+         "Choose",
+         ["Default DB", "Uploaded Files"],
+         label_visibility="collapsed",
+         disabled=uploaded_df_schema is False,
+         default="Default DB" if uploaded_df_schema is False else "Uploaded Files",
+     )
+
+     if uploaded_df_schema is False:
+         st.markdown("""> If you want to use your own files (CSV/XLSX), click here -""")
+         st.page_link(
+             page="pages/3 📂File Upload for SQL.py",
+             label="Upload your own CSV or Excel files",
+             icon="📜",
+         )
+         schema = load_defaultdb_schema_text()
+         st.markdown(schema, unsafe_allow_html=True)
+     elif choice == "Default DB":
+         schema = load_defaultdb_schema_text()
+         st.markdown(schema, unsafe_allow_html=True)
+     else:
+         pretty_schema, markdown = st.tabs(["Schema", "Copy Schema in Markdown"])
+         with pretty_schema:
+             st.info(
+                 "You can copy this schema and give it to any state-of-the-art LLM (Gemini, ChatGPT, Claude, etc.) to cross-check your answers.\nYou can run queries directly here using the ***Manual Query Executer*** in the sidebar and download your results 😊",
+                 icon="ℹ️",
+             )
+             st.markdown(uploaded_df_schema, unsafe_allow_html=True)
+         with markdown:
+             st.info(
+                 "You can copy this schema and give it to any state-of-the-art LLM (Gemini, ChatGPT, Claude, etc.) to cross-check your answers.\nYou can run queries directly here using the ***Manual Query Executer*** in the sidebar and download your results 😊",
+                 icon="ℹ️",
+             )
+             st.markdown(f"```\n{uploaded_df_schema}\n```")
+
+
+ data_source = None
+ if uploaded_dataframes is None or len(uploaded_dataframes) == 0:
+     data_source = "Default Database"
+     col1, col2 = st.columns([4, 1])
+     with col1:
+         st.caption(
+             "Use this dialog to execute SQL commands on the Default Database. To use your own Excel/CSV files:"
+         )
+     with col2:
+         st.page_link(page="pages/3 📂File Upload for SQL.py", label="Click Here")
+ else:
+     data_source = st.radio("Select Data Source", ["Default Database", "Uploaded Files"])
+
+ if data_source == "Default Database":
+     dataframes = default_dfs
+     if uploaded_dataframes is not None and len(uploaded_dataframes) > 0:
+         st.caption(
+             'Use this dialog to execute SQL commands on the Default Database. Select "Uploaded Files" above to query your own Excel/CSV files.'
+         )
+ else:
+     dataframes = uploaded_dataframes
+     st.caption("Use this dialog to execute SQL commands on your uploaded files.")
+
+ sql_command = st.text_area("Enter your SQL command here:")
+ if st.button("Execute"):
+     df = execute_sql_duckdb(sql_command, dataframes)
+     if df is not None:
+         st.dataframe(df)
+         st.info(f"Rows x Columns: {df.shape[0]} x {df.shape[1]}")
+         st.subheader("Data Description:")
+         st.markdown(df.describe().T.to_markdown())
+         st.subheader("Data Types:")
+         st.write(df.dtypes)
+     else:
+         st.error(
+             "An error occurred while executing the SQL command. Please cross-check your command as well as table and column names."
+         )
pages/3 📂File Upload for SQL.py ADDED
@@ -0,0 +1,90 @@
+ import streamlit as st
+ import pandas as pd
+ from utils.sql_utils import create_schema
+
+ MAX_FILES = 5
+
+ if "uploaded_dataframes" not in st.session_state:
+     st.session_state.uploaded_dataframes = {}
+
+ st.header("Upload your CSV or Excel files")
+ st.caption("A maximum of 5 files can be uploaded.")
+
+ num_uploaded_files = len(st.session_state.uploaded_dataframes)
+
+ disabled = num_uploaded_files >= MAX_FILES
+
+ uploaded_files = st.file_uploader(
+     "Choose files",
+     type=["csv", "xlsx"],
+     accept_multiple_files=True,
+     disabled=disabled,
+ )
+
+ if uploaded_files and not disabled:
+     uploaded_count = 0
+     for uploaded_file in uploaded_files:
+         if len(st.session_state.uploaded_dataframes) < MAX_FILES:
+             df = None
+             try:
+                 if uploaded_file.name.endswith(".csv"):
+                     df = pd.read_csv(uploaded_file)
+                 elif uploaded_file.name.endswith(".xlsx"):
+                     df = pd.read_excel(uploaded_file)
+
+                 if uploaded_file.name in st.session_state.uploaded_dataframes:
+                     st.toast(
+                         f"File {uploaded_file.name} already uploaded. Skipping...",
+                         icon="⚠️",
+                     )
+                 else:
+                     key = (
+                         uploaded_file.name.lower()
+                         .strip()
+                         .replace(" ", "_")
+                         .replace(".csv", "")
+                         .replace(".xlsx", "")
+                     )
+                     st.session_state.uploaded_dataframes[key] = df
+                     uploaded_count += 1
+                     print(f"Uploaded file: {str(uploaded_file.name)}")
+             except Exception as e:
+                 st.error(f"Error reading file {uploaded_file.name}: {e}")
+         else:
+             st.warning(
+                 f"Maximum number of files ({MAX_FILES}) reached. Cannot upload more files."
+             )
+             break
+     if uploaded_count > 0:
+         st.success(
+             f"{uploaded_count} file(s) uploaded successfully. Total: {len(st.session_state.uploaded_dataframes)} file(s)."
+         )
+
+
+ if len(st.session_state.uploaded_dataframes) >= MAX_FILES:
+     st.warning(
+         f"Maximum number of files ({MAX_FILES}) reached. Cannot upload more files."
+     )
+
+
+ dataframes = st.session_state.uploaded_dataframes
+
+ if dataframes:
+     st.header("Schema of Uploaded Files📚")
+     schema = create_schema(dataframes)
+     # Always reassign, so the stored schema stays in sync with newly added files.
+     st.session_state.uploaded_df_schema = schema
+     pretty_schema, markdown = st.tabs(["Schema", "Copy Schema in Markdown"])
+     with pretty_schema:
+         st.info(
+             "You can copy this schema and give it to any state-of-the-art LLM (Gemini, ChatGPT, Claude, etc.) to cross-check your answers.\nYou can run queries directly here using the ***Manual Query Executer*** in the sidebar and download your results 😊",
+             icon="ℹ️",
+         )
+         st.markdown(schema, unsafe_allow_html=True)
+     with markdown:
+         st.info(
+             "You can copy this schema and give it to any state-of-the-art LLM (Gemini, ChatGPT, Claude, etc.) to cross-check your answers.\nYou can run queries directly here using the ***Manual Query Executer*** in the sidebar and download your results 😊",
+             icon="ℹ️",
+         )
+         st.markdown(f"```\n{schema}\n```")
+
pages/4 👨‍🎓Connect with me.py ADDED
@@ -0,0 +1,276 @@
+ import streamlit as st
+
+ st.markdown(
+     """
+     <style>
+     body {
+         background-color: #1e1e1e;
+         color: #f0f0f0;
+         font-family: sans-serif;
+     }
+     h1, h2, h3, h4, h5, h6 {
+         color: #f0f0f0;
+     }
+     a {
+         color: #89b4fa;
+         text-decoration: none;
+     }
+     a:hover {
+         text-decoration: underline;
+     }
+     .social-link-button {
+         display: inline-block;
+         padding: 8px 16px;
+         margin: 4px;
+         background-color: #333;
+         color: #fff;
+         border-radius: 5px;
+         text-decoration: none;
+     }
+     .social-link-button:hover {
+         background-color: #555;
+     }
+     .project-card {
+         background-color: #282828;
+         border: 1px solid #444;
+     }
+     .skills, .experience, .projects, .volunteering, .achievements, .certifications, .education {
+         background-color: #282828;
+         border: 1px solid #444;
+     }
+     .status.deployed {
+         color: #a3be8c;
+     }
+     .status.ongoing {
+         color: #f9bb6d;
+     }
+     footer {
+         background-color: #333;
+         color: #f0f0f0;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ st.html(
+     """<!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Debopam Chowdhury (Param)</title>
+ </head>
+ <body>
+     <header style="text-align: center; padding: 2rem 0; border-bottom: 1px solid #444;">
+         <img src="https://tinyurl.com/ysa6yekw" alt="Debopam Chowdhury (Param)" class="profile-image" style="width: 150px; height: 150px; border-radius: 50%; object-fit: cover; margin-bottom: 1rem;">
+         <h1>Debopam Chowdhury <span class="nickname" style="font-weight: normal; color: #999;">(Param)</span></h1>
+         <p class="tagline" style="color: #ccc; margin-bottom: 1rem;">Creator of <a href="http://aigymbuddy.in" target="_blank" style="color: #89b4fa;">AiGymBuddy.in</a> | Machine Learning | Deep Learning | Flutter | Math | MLops | TensorFlow | FastAPI</p>
+         <p class="pronouns" style="font-size: 0.9rem; color: #999; margin-bottom: 1rem;">(He/Him)</p>
+         <div class="social-links">
+             <a href="https://www.linkedin.com/in/debopam-chowdhury-param-600619229/" target="_blank" class="social-link-button">LinkedIn</a>
+             <a href="https://www.youtube.com/@DCparam/featured" target="_blank" class="social-link-button">YouTube</a>
+             <a href="https://github.com/DebopamParam" target="_blank" class="social-link-button">GitHub</a>
+             <a href="https://www.instagram.com/debopam_param.ai/" target="_blank" class="social-link-button">Instagram</a>
+         </div>
+     </header>
+
+     <section class="about-me" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>About Me</h2>
+         <p style="line-height: 1.7;">Passionate about Machine Learning, Computer Science, and Mathematics. I have a strong grip on Machine & Deep Learning fundamentals, Computer Networks, OS, and DSA, and I have solved 100+ problems on LeetCode. I have a keen interest in Mathematics, Deep Learning, and Statistics, and I like to understand things in a deeper way.</p>
+     </section>
+
+     <section class="skills" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Skills</h2>
+         <div class="skills-grid" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 1.5rem;">
+             <div class="skill-category">
+                 <h3>Technical</h3>
+                 <ul style="padding-left: 0; list-style: none;">
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Programming:</strong> Python, Java, Dart, VS Code, Git, GitHub, Jupyter Notebooks, CI/CD</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Machine Learning & Deep Learning:</strong> TensorFlow, PyTorch, Scikit-learn, Keras, Supervised Learning, Unsupervised Learning, Neural Networks, Sequence Modeling, Convolution, Attention Mechanisms, Transformer, GPT, BERT, Hyperparameter Optimization</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Data Handling:</strong> Pandas, NumPy, Data Manipulation, Data Preparation, SQL, PySpark, NoSQL</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Cloud & DevOps:</strong> Docker, AWS, Cloud-AI, FastAPI</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Mathematics:</strong> Linear Algebra, Probability, Statistics, Boosting Methods</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Frameworks & Tools:</strong> Flutter, Firebase, Deep Learning Frameworks, Model Training & Optimization, Version Control, LLM Fine-Tuning</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Generative AI:</strong> Vector Embeddings, Indexing, Chunking, RAG pipelines, LlamaIndex, LangChain, Colpali, Byaldi, Chroma DB</li>
+                 </ul>
+             </div>
+         </div>
+     </section>
+
+     <section class="experience" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Experience</h2>
+         <div class="experience-item" style="margin-bottom: 1.5rem;">
+             <h3>GENERATIVE AI ENGINEER (Contract - Remote)</h3>
+             <p class="company" style="font-weight: bold; color: #bbb; margin-bottom: 0.25rem;">Private Client | Sydney, Australia</p>
+             <p class="duration" style="font-size: 0.9rem; color: #999; margin-bottom: 0.75rem;">October - November 2024</p>
+             <ul style="margin-top: 0.5rem; padding-left: 20px;">
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Developed secure, on-premise solutions for Q&A and knowledge retrieval over private, complex data (PDFs with images and charts).</li>
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Built a data ingestion pipeline (100-500 documents daily) with vision embeddings and task-scheduler updates.</li>
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Built multimodal RAG pipelines (Byaldi, Colqwen2, Pixtral 12B) optimized for diverse document types (70% accuracy improvement).</li>
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Containerized the application with Docker for deployment flexibility.</li>
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Open-Source Contribution - Byaldi - 575✩:</strong> made while working on it</li>
+                 <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><strong>Technologies:</strong> NLP, Vision Embeddings, Local Multimodal RAG, LangChain, Pixtral 12B, Col-Qwen2, Byaldi</li>
+             </ul>
+         </div>
+     </section>
+
+     <section class="projects" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Projects</h2>
+         <div class="project-list" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 1.5rem;">
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">LLM Fine-Tuning and SQL Agent, with auto-execution via DuckDB, a schema retriever for CSVs, and a manual SQL executer.</h3>
+                 <p class="status ongoing" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #f9bb6d;">The current web app you are using</p>
+                 <p class="status deployed" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #a3be8c;">Deployed <i class="fas fa-check-circle"></i></p>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">Deep Learning Based Recommendation System</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;">Initially, I aimed to build a recommendation system from scratch using TensorFlow Recommenders (TFRS) on the MovieLens 1M dataset. This involved creating user and movie embeddings with a candidate generation and ranking model.</p>
+
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;">However, this TFRS approach proved too resource-intensive and time-consuming for effective training and testing. Crucially, the initial results weren't satisfactory for deployment.</p>
+
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;">Therefore, for the deployed web application, I switched to pre-trained models (BGE embeddings and re-ranking). This offered:</p>
+                 <ul style="padding-left: 20px;">
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><b>Better Performance:</b> More relevant recommendations.</li>
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;"><b>Reduced Resources/Time:</b> Faster training and deployment.</li>
+                 </ul>
+
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;">While the TFRS code is also included in the web app, the pre-trained model approach was chosen for its superior results and efficiency in a deployment setting. A future improvement could be fine-tuning the pre-trained models for even better performance.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> TensorFlow Recommenders, ScaNN, Vector DB, Distributed GPU Training, LangChain, Streamlit, BAAI BGE Models</p>
+                 <div class="project-links">
+                     <a href="https://debopam-movie-recommendation-system.streamlit.app/" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Try the app live</a>
+                 </div>
+                 <p class="status deployed" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #a3be8c;">Deployed <i class="fas fa-check-circle"></i></p>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">IBM EMPLOYEE ATTRITION PREDICTOR (End to End, deployed to AWS; FastAPI with a proxy server)</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Objective:</strong> Predicted employee attrition with 85% AUC to improve employee retention and business performance.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Model Development:</strong> Hyperparameter-optimized Multi-Layer Perceptron, XGBoost, and Logistic Regression, with both inference and training pipelines.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Backend:</strong> Developed a FastAPI backend for real-time predictions, using Pydantic for schema validation of incoming and outgoing requests.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Deployment:</strong> Containerized with Docker, deployed on AWS EC2, managed via AWS ECR.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>CI/CD:</strong> Set up an automated CI/CD pipeline using GitHub Actions for seamless updates.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Web Application:</strong> Built a user-friendly interface using Flutter Web for real-time interaction.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Security:</strong> Handled HTTPS requests using Caddy as a reverse proxy server.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> TensorFlow, AWS, Docker, FastAPI, CI/CD Pipeline, Multi-Layer Perceptron, Neural Network, XGBoost, Logistic Regression, Hyperparameter-Tuned Models, GitHub Actions, Pydantic, Flutter Web, Reverse Proxy Server: Caddy</p>
+                 <div class="project-links">
+                     <a href="https://www.linkedin.com/posts/debopam-chowdhury-param-600619229_machinelearning-deeplearning-aws-activity-7244476917884608512-DfbD/?utm_source=share&utm_medium=member_desktop" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Explanation & Live Demo</a>
+                     <a href="http://www.debopamchowdhury.works" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Click to try it out live</a>
+                 </div>
+                 <p class="status deployed" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #a3be8c;">Deployed <i class="fas fa-check-circle"></i></p>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">AI GYM BUDDY (LangChain | Flutter | Riverpod | Gemini)</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;">Personalized AI-Driven Workouts with Smart Equipment Detection and Progress Tracking</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Features:</strong> AI instrument detection (camera or gallery), exercises based on available equipment, time, preferred muscle groups & custom requests, a dynamic video tutorial finder for each exercise, a super-personalized AI-generated routine, a workout history tracker, and easy sign-up/login with Google OAuth</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> Dart, Flutter, Firebase, Gemini 1.5 Flash, Riverpod, LangChain, FastAPI, Google OAuth</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>License:</strong> The code for this app/website was written from scratch, and I hold all rights over its distribution.</p>
+                 <div class="project-links">
+                     <a href="https://www.youtube.com/shorts/0ZR0IWiZJQE" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">1-Minute Video Demo</a>
+                     <a href="https://play.google.com/store/apps/details?id=com.aigymbuddy.me&hl=en" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">
+                         <img src="https://static-00.iconduck.com/assets.00/google-play-icon-2048x2048-487quz63.png" alt="Google Play Store" class="playstore-icon" style="width: 16px; height: 16px; vertical-align: middle; margin-right: 0.25rem;"> Google Play Store (Android)
+                     </a>
+                     <a href="http://www.aigymbuddy.in" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Prototype for web: www.aigymbuddy.in</a>
+                 </div>
+                 <p class="status deployed" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #a3be8c;">Deployed <i class="fas fa-check-circle"></i></p>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">Non-Sequential Breast Cancer Classification System</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Multi-Modal Cancer Detection:</strong> Developed a novel multi-output deep learning model for breast cancer detection, predicting cancer presence, invasiveness, and difficult-negative case status. The model incorporates both mammogram images and tabular clinical data, leveraging a non-sequential architecture to process distinct data modalities.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Fine-Tuned Image Feature Extraction:</strong> Utilized a pre-trained EfficientNetV2B3 model for image feature extraction, fine-tuning layers from block 6 onwards to enhance its applicability to the specific task, improving the quality of learned representations and potentially making the model more robust and accurate.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Distributed Training:</strong> Accelerated model training through distributed training using TensorFlow's MirroredStrategy on 2xT4 GPUs for 9 hours on Kaggle, demonstrating proficiency in optimizing model training with limited computational resources.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> TensorFlow, Transfer Learning, EfficientNetV2, Fused MB-CNN</p>
+                 <div class="project-links">
+                     <a href="https://debopamparam-bcd-inference-vvyb1v.streamlit.app/" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Live Webapp + Architecture + Training Code + Evaluation Metrics</a>
+                 </div>
+                 <p class="status deployed" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #a3be8c;">Deployed <i class="fas fa-check-circle"></i></p>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">Image Entity Extraction with Qwen2 VL: Large-Scale Inference</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Problem Statement:</strong> E-commerce and healthcare industries struggle to efficiently extract product details (weight, volume, dimensions) from images at scale.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Action:</strong> Developed a large-scale image-to-text inference pipeline using Qwen2 VL: 2B, incorporating image preprocessing, regex, and parallel processing. Processed 84,000 of 131,000 test images.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Result:</strong> Successfully extracted product values from a significant portion of the dataset. Our team of four ranked 172nd out of ~75,000 in the Amazon ML Challenge with an F1-Score of 0.47, demonstrating the solution's potential for automated product information extraction.</p>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> Qwen2 VL, Python, Regex, Parallel Processing</p>
+                 <div class="project-links">
+                     <a href="https://colab.research.google.com/drive/1V5F1XMlYNHzv-hA9xmIJ-Jx5vuD0fKAR?usp=sharing" target="_blank" style="display: inline-block; margin-right: 1rem; font-size: 0.9rem;">Click here to see the code</a>
+                 </div>
+             </div>
+
+             <div class="project-card" style="border: 1px solid #444; padding: 1.5rem; border-radius: 8px;">
+                 <h3 style="margin-top: 0; margin-bottom: 0.75rem;">LLM-based ATS System using Vertex AI Embeddings</h3>
+                 <p style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>Technologies used:</strong> LangChain, Vertex AI Embeddings, Streamlit, Postgres pgvector</p>
+                 <p class="status ongoing" style="font-size: 0.85rem; font-weight: bold; margin-top: 0.5rem; color: #f9bb6d;">Ongoing</p>
+             </div>
+         </div>
+     </section>
+
+     <section class="volunteering" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Volunteering</h2>
+         <div class="volunteering-item" style="margin-bottom: 1rem;">
+             <h3>Git/GitHub Instructor (Volunteer)</h3>
+             <p style="color: #ccc;">Carried out sessions to teach juniors the fundamentals of Git and GitHub, covering version control, collaboration, and best practices.</p>
+         </div>
+         <div class="volunteering-item" style="margin-bottom: 1rem;">
+             <h3>Event Coordinator</h3>
+             <p style="color: #ccc;">Acharya Technical Club - Steigen</p>
+         </div>
+     </section>
+
+     <section class="achievements" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Achievements</h2>
+         <ul style="padding-left: 20px;">
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;">Secured rank 172 out of ~75,000 participants in the Amazon ML Challenge Hackathon 2024.</li>
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;">2nd place out of 60 in the TechnioD Hackathon.</li>
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong><a href="https://github.com/AnswerDotAI/byaldi/pull/50" target="_blank" style="color: #89b4fa;">Open-Source Contribution - Byaldi - 575☆</a></strong>
+                 <ul style="padding-left: 20px;">
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Fix langchain integration not present in pypi tar & whl-- pyproject.toml</li>
+                 </ul>
+             </li>
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>NON-TECHNICAL:</strong>
+                 <ul style="padding-left: 20px;">
+                     <li style="margin-bottom: 0.5rem; font-size: 0.95rem;">Performed in IIT Bombay's Mood Indigo Bengaluru event (Finalist).</li>
+                 </ul>
+             </li>
+         </ul>
+     </section>
+
+     <section class="certifications" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Certifications</h2>
+         <ul style="padding-left: 20px;">
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>COURSERA:</strong> Advanced Learning Algorithms | <a href="https://coursera.org/share/d540318c8cb4a7e802d8c4964a471d34" target="_blank" style="color: #89b4fa;">View Certificate</a></li>
+             <li style="margin-bottom: 0.75rem; font-size: 0.95rem;"><strong>COURSERA:</strong> Supervised Machine Learning: Regression and Classification | <a href="https://coursera.org/share/d540318c8cb4a7e802d8c4964a471d34" target="_blank" style="color: #89b4fa;">View Certificate</a></li>
+         </ul>
+     </section>
+
+     <section class="education" style="padding: 2rem; margin-bottom: 1.5rem; border-radius: 8px;">
+         <h2>Education</h2>
+         <div class="education-item" style="margin-bottom: 0.75rem; font-size: 0.95rem;">
+             <h3>BE in Information Science</h3>
+             <p class="institution" style="color: #ccc;">Acharya Institute of Technology, Bangalore</p>
+             <p class="duration" style="font-size: 0.9rem; color: #999;">2021-2025</p>
+             <p>CGPA: 8.12</p>
+         </div>
+         <div class="education-item" style="margin-bottom: 0.75rem; font-size: 0.95rem;">
+             <h3>Higher Secondary Education</h3>
+             <p class="institution" style="color: #ccc;">Kalyani Public School, Barasat, Kolkata</p>
+             <p class="duration" style="font-size: 0.9rem; color: #999;">2021</p>
+             <p>77% (Auto-Pass Covid Batch)</p>
+         </div>
+         <div class="education-item" style="margin-bottom: 0.75rem; font-size: 0.95rem;">
+             <h3>Secondary Education</h3>
+             <p class="institution" style="color: #ccc;">Sacred Heart Day High School, Kolkata</p>
+             <p class="duration" style="font-size: 0.9rem; color: #999;">2019</p>
+             <p>90%</p>
+         </div>
+     </section>
+
+     <footer style="text-align: center; padding: 1rem 0; font-size: 0.9rem;">
+         <p>© 2024 Debopam Chowdhury (Param)</p>
+     </footer>
+ </body>
+ </html>"""
+ )
requirements.txt ADDED
Binary file (180 Bytes).
 
static/database_scema.txt ADDED
@@ -0,0 +1,103 @@
+ ## The Tables in the Default Database are:
+ `customers`, `order_items`, `orders`, `payments`, and `products`.
+
+ ### **customers**
+ | | customer_id | customer_zip_code_prefix | customer_city | customer_state |
+ |------:|:--------------|---------------------------:|:----------------|:-----------------|
+ | 21921 | 0tgYlOTGgpO6 | 79230 | russas | CE |
+ | 9748 | jGhRQF3CIew4 | 81460 | joao monlevade | MG |
+ | 22679 | 1UutQTIhBvcP | 94480 | pelotas | RS |
+
+ Rows: 38279, Columns: 4
+
+ ```sql
+ CREATE TABLE customers (
+     customer_id VARCHAR(255) PRIMARY KEY,
+     customer_zip_code_prefix INT,
+     customer_city VARCHAR(255),
+     customer_state VARCHAR(2)
+ );
+ ```
+
+ ### **order_items**
+ | | order_id | product_id | seller_id | price | shipping_charges |
+ |------:|:-------------|:-------------|:-------------|--------:|-------------------:|
+ | 19729 | PDEzZdebLSn3 | aBpYjaBcwz6e | bzfcwRPnZzVO | 55.83 | 27.8 |
+ | 6001 | R7bIPjjYqlHP | ZM2JJXV5m9hl | Ivbw25fb5t2Z | 100 | 42.05 |
+ | 282 | Biqo21nETaMO | XqmdGKRbTetH | P2nCHWuo0HC0 | 113.49 | 91.32 |
+
+ Rows: 38279, Columns: 5
+
+ ```sql
+ CREATE TABLE order_items (
+     order_id VARCHAR(255),
+     product_id VARCHAR(255),
+     seller_id VARCHAR(255),
+     price DECIMAL(10, 2),
+     shipping_charges DECIMAL(10, 2),
+     PRIMARY KEY (order_id, product_id),
+     FOREIGN KEY (order_id) REFERENCES orders(order_id),
+     FOREIGN KEY (product_id) REFERENCES products(product_id)
+ );
+ ```
+
+ ### **orders**
+
+ | | order_id | customer_id | order_purchase_timestamp | order_approved_at |
+ |------:|:-------------|:--------------|:---------------------------|:--------------------|
+ | 7294 | PMqwQc01iDTJ | c9ueC6k6V5WS | 2018-06-19 21:23:48 | 2018-06-20 08:38:30 |
+ | 13800 | P4l8R2Qat5n7 | ovKkGaXi5TmN | 2018-01-05 08:26:03 | 2018-01-05 08:47:20 |
+ | 17679 | NxIseZjAQCdC | o9qzmUQVJOxA | 2018-01-28 23:46:53 | 2018-01-28 23:58:31 |
+
+ Rows: 38279, Columns: 4
+
+ ```sql
+ CREATE TABLE orders (
+     order_id VARCHAR(255) PRIMARY KEY,
+     customer_id VARCHAR(255),
+     order_purchase_timestamp TIMESTAMP,
+     order_approved_at TIMESTAMP,
+     FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
+ );
+ ```
+
+ ### **payments**
+ | | order_id | payment_sequential | payment_type | payment_installments | payment_value |
+ |------:|:-------------|---------------------:|:---------------|-----------------------:|----------------:|
+ | 35526 | cQXl0pQtiMad | 1 | wallet | 1 | 172.58 |
+ | 35799 | olImD2k316Gz | 1 | credit_card | 3 | 16.78 |
+ | 13278 | G9MJYXXtPZSz | 1 | credit_card | 10 | 221.86 |
+
+ Rows: 38279, Columns: 5
+
+ ```sql
+ CREATE TABLE payments (
+     order_id VARCHAR(255),
+     payment_sequential INT,
+     payment_type VARCHAR(50),
+     payment_installments INT,
+     payment_value DECIMAL(10, 2),
+     PRIMARY KEY (order_id, payment_sequential),
+     FOREIGN KEY (order_id) REFERENCES orders(order_id)
+ );
+ ```
+
+ ### **products**
+ | | product_id | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm |
+ |------:|:-------------|:------------------------|-------------------:|--------------------:|--------------------:|-------------------:|
+ | 18191 | hpiXwRzTkhkL | bed_bath_table | 1150 | 40 | 9 | 50 |
+ | 2202 | iPoRkE7dkmlc | toys | 15800 | 38 | 62 | 57 |
+ | 27442 | hrjNaMt3Wyo5 | toys | 1850 | 37 | 22 | 40 |
+
+ Rows: 38279, Columns: 6
+
+ ```sql
+ CREATE TABLE products (
+     product_id VARCHAR(255) PRIMARY KEY,
+     product_category_name VARCHAR(255),
+     product_weight_g INT,
+     product_length_cm INT,
+     product_height_cm INT,
+     product_width_cm INT
+ );
+ ```
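
Since this schema file is served to the app as plain text, here is a minimal sketch of querying the same default database directly, assuming the bundled CSVs under `static/` map to the table names in the schema above. This mapping is an assumption: the app's own loader, `utils.sql_utils.load_data`, is not fully shown in this diff.

```python
import duckdb
import pandas as pd

# Assumed CSV-to-table mapping, inferred from the schema text above.
tables = {
    "customers": pd.read_csv("static/df_Customers.csv"),
    "orders": pd.read_csv("static/df_Orders.csv"),
    "order_items": pd.read_csv("static/df_OrderItems.csv"),
}

con = duckdb.connect(database=":memory:")
for name, df in tables.items():
    con.register(name, df)  # each DataFrame becomes a queryable table

# Example: top 5 cities by revenue, joining all three tables.
print(
    con.execute(
        """
        SELECT c.customer_city,
               SUM(oi.price + oi.shipping_charges) AS revenue
        FROM customers c
        JOIN orders o ON o.customer_id = c.customer_id
        JOIN order_items oi ON oi.order_id = o.order_id
        GROUP BY c.customer_city
        ORDER BY revenue DESC
        LIMIT 5
        """
    ).fetchdf()
)
```
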
static/default_questions.txt ADDED
@@ -0,0 +1,173 @@
+ These questions were generated by ChatGPT-4o. Copy and paste a question into the chat box, run it, get the results, and compare them yourself.
+ > You can also upload your own files to get their schemas. You can then use these schemas to cross-check your answers against any larger LLM. You can also cross-check directly with our Manual SQL Executer 😊.
+ - Ask Questions
+ - Run Queries: automatic + manual
+ - Download Results
+
+ ### Easy Questions
+
+ 1.
+ **Question:**
+ ```
+ Retrieve all customer IDs and their corresponding cities from the `customers` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT customer_id, customer_city FROM customers;
+ ```
+
+ ---
+
+ 2.
+ **Question:**
+ ```
+ List all products along with their category names from the `products` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT product_id, product_category_name FROM products;
+ ```
+
+ ---
+
+ 3.
+ **Question:**
+ ```
+ Fetch the order IDs and their purchase timestamps from the `orders` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT order_id, order_purchase_timestamp FROM orders;
+ ```
+
+ ---
+
+ 4.
+ **Question:**
+ ```
+ Display the distinct payment types available in the `payments` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT DISTINCT payment_type FROM payments;
+ ```
+
+ ---
+
+ 5.
+ **Question:**
+ ```
+ Find the total number of rows in the `customers` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT COUNT(*) AS total_customers FROM customers;
+ ```
+
+ ---
+
+ ### Medium Questions
+
+ 1.
+ **Question:**
+ ```
+ Retrieve the total payment value for each order from the `payments` table, grouped by `order_id`.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT order_id, SUM(payment_value) AS total_payment
+ FROM payments
+ GROUP BY order_id;
+ ```
+
+ ---
+
+ 2.
+ **Question:**
+ ```
+ Find all orders where the total shipping charges (sum of `shipping_charges`) exceed 100.
+ ```
+ **Fine-Tuned Model Results:**
+ ❌ **Fail**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT order_id
+ FROM order_items
+ GROUP BY order_id
+ HAVING SUM(shipping_charges) > 100;
+ ```
+ **Issue:** Missing validation for null or non-existent data.
+
+ ---
+
+ 3.
+ **Question:**
+ ```
+ List the names of cities and the number of customers in each city, sorted in descending order of the number of customers.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT customer_city, COUNT(*) AS customer_count
+ FROM customers
+ GROUP BY customer_city
+ ORDER BY customer_count DESC;
+ ```
+
+ ---
+
+ ### Hard Questions
+
+ 1.
+ **Question:**
+ ```
+ Write a query to find the total revenue (sum of `price` + `shipping_charges`) generated for each product category in the `order_items` table, joined with the `products` table.
+ ```
+ **Fine-Tuned Model Results:**
+ ✅ **Pass**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT
+     p.product_category_name,
+     SUM(o.price + o.shipping_charges) AS total_revenue
+ FROM order_items o
+ JOIN products p ON o.product_id = p.product_id
+ GROUP BY p.product_category_name
+ ORDER BY total_revenue DESC;
+ ```
+
+ ---
+
+ 2.
+ **Question:**
+ ```
+ Write a query to identify the top 5 products with the highest total sales value (sum of `price`) across all orders.
+ ```
+ **Fine-Tuned Model Results:**
+ ❌ **Fail**
+ **Answer by ChatGPT-4o:**
+ ```sql
+ SELECT
+     product_id,
+     SUM(price) AS total_sales
+ FROM order_items
+ GROUP BY product_id
+ ORDER BY total_sales DESC
+ LIMIT 5;
+ ```
+ **Issue:** Misalignment with finer-grained filters or lack of handling for tied ranks.
+
+ ---
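
On the null-handling issue flagged in the second medium question, a small hedged illustration: the `COALESCE` variant below is one possible fix, not the fine-tuned model's output.

```python
import duckdb
import pandas as pd

# Toy data with a missing shipping charge (becomes NULL in DuckDB).
order_items = pd.DataFrame(
    {"order_id": ["A", "A", "B"], "shipping_charges": [60.0, 50.0, None]}
)

con = duckdb.connect(database=":memory:")
con.register("order_items", order_items)

# SUM already skips NULLs; COALESCE makes that choice explicit (missing
# charges count as 0) and turns an all-NULL group into 0 instead of NULL.
print(
    con.execute(
        """
        SELECT order_id
        FROM order_items
        GROUP BY order_id
        HAVING SUM(COALESCE(shipping_charges, 0)) > 100
        """
    ).fetchdf()
)
```
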
static/df_Customers.csv ADDED
The diff for this file is too large to render.
 
static/df_OrderItems.csv ADDED
The diff for this file is too large to render.
 
static/df_Orders.csv ADDED
The diff for this file is too large to render.
 
static/df_Payments.csv ADDED
The diff for this file is too large to render.
 
static/df_Products.csv ADDED
The diff for this file is too large to render.
 
utils/__init__.py ADDED
File without changes
utils/handle_sql_commands.py ADDED
@@ -0,0 +1,18 @@
+ # handle_sql_commands.py
+ from typing import Optional
+
+ import duckdb
+ import pandas as pd
+ import streamlit as st
+
+
+ def execute_sql_duckdb(sql_command: str, dataframes: dict) -> Optional[pd.DataFrame]:
+     try:
+         con = duckdb.connect(database=":memory:", read_only=False)
+         # Each DataFrame is registered under its dict key, so the key becomes
+         # the table name available to the SQL command.
+         for df_name, df in dataframes.items():
+             con.register(df_name, df)
+         result_df = con.execute(sql_command).fetchdf()
+         con.close()
+         return result_df
+     except duckdb.Error as e:
+         st.error(f"DuckDB Error: {e}")
+         return None
+
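
A quick usage sketch of `execute_sql_duckdb` (the sample data below is an illustrative assumption, not part of the commit): dict keys become DuckDB table names, so `customers` is queryable directly.

```python
import pandas as pd

from utils.handle_sql_commands import execute_sql_duckdb

# Hypothetical sample data; in the app this dict comes from load_data()
# or st.session_state.uploaded_dataframes.
frames = {
    "customers": pd.DataFrame(
        {"customer_id": ["c1", "c2"], "customer_city": ["pelotas", "russas"]}
    )
}

result = execute_sql_duckdb("SELECT customer_city FROM customers", frames)
if result is not None:
    print(result)
```
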
utils/llm_logic.py ADDED
@@ -0,0 +1,230 @@
1
+ # llm_logic.py
2
+ # from langchain_ollama import ChatOllama
3
+ from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
4
+ import streamlit as st
5
+ import multiprocessing
6
+ from langchain_community.chat_models import ChatLlamaCpp
7
+
8
+ local_model = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
9
+
10
+ stop = [
11
+ "<|image_pad|>",
12
+ "<|endoftext|>",
13
+ "<|quad_end|>",
14
+ "<|object_ref_end|>",
15
+ "<|object_ref_start|>",
16
+ "<|file_sep|>",
17
+ "<|repo_name|>",
18
+ "<|PAD_TOKEN|>",
19
+ "<|quad_start|>",
20
+ "<|box_start|>",
21
+ "<|box_end|>",
22
+ "<|im_start|>",
23
+ "</tool_call>",
24
+ "<|video_pad|>",
25
+ "<tool_call>",
26
+ "<|im_end|>",
27
+ "<|vision_",
28
+ "<|fim_",
29
+ ]
30
+
31
+
32
+ def get_llm():
33
+ cache_llm = ChatLlamaCpp(
34
+ temperature=0.0,
35
+ model_path=local_model,
36
+ n_ctx=10000,
37
+ n_gpu_layers=0,
38
+ n_batch=1024,
39
+ max_tokens=500,
40
+ n_threads=multiprocessing.cpu_count() - 1,
41
+ top_p=0.97,
42
+ verbose=False,
43
+ stop=stop,
44
+ )
45
+ return cache_llm
46
+
47
+
48
+ llm = get_llm()
49
+
50
+
51
+ db_schema = """### **customers**
52
+ | | customer_id | customer_zip_code_prefix | customer_city | customer_state |
53
+ |------:|:--------------|---------------------------:|:----------------|:-----------------|
54
+ | 21921 | 0tgYlOTGgpO6 | 79230 | russas | CE |
55
+ | 9748 | jGhRQF3CIew4 | 81460 | joao monlevade | MG |
56
+ | 22679 | 1UutQTIhBvcP | 94480 | pelotas | RS |
57
+
58
+ Rows: 38279, Columns: 4
59
+
60
+ ---
61
+
62
+ ### **order_items**
63
+ | | order_id | product_id | seller_id | price | shipping_charges |
64
+ |------:|:-------------|:-------------|:-------------|--------:|-------------------:|
65
+ | 19729 | PDEzZdebLSn3 | aBpYjaBcwz6e | bzfcwRPnZzVO | 55.83 | 27.8 |
66
+ | 6001 | R7bIPjjYqlHP | ZM2JJXV5m9hl | Ivbw25fb5t2Z | 100 | 42.05 |
67
+ | 282 | Biqo21nETaMO | XqmdGKRbTetH | P2nCHWuo0HC0 | 113.49 | 91.32 |
68
+
69
+ Rows: 38279, Columns: 5
70
+
71
+ ---
72
+
73
+ ### **orders**
74
+
75
+ | | order_id | customer_id | order_purchase_timestamp | order_approved_at |
76
+ |------:|:-------------|:--------------|:---------------------------|:--------------------|
77
+ | 7294 | PMqwQc01iDTJ | c9ueC6k6V5WS | 2018-06-19 21:23:48 | 2018-06-20 08:38:30 |
78
+ | 13800 | P4l8R2Qat5n7 | ovKkGaXi5TmN | 2018-01-05 08:26:03 | 2018-01-05 08:47:20 |
79
+ | 17679 | NxIseZjAQCdC | o9qzmUQVJOxA | 2018-01-28 23:46:53 | 2018-01-28 23:58:31 |
80
+
81
+ Rows: 38279, Columns: 4
82
+
83
+ ---
84
+
85
+ ### **payments**
86
+ | | order_id | payment_sequential | payment_type | payment_installments | payment_value |
87
+ |------:|:-------------|---------------------:|:---------------|-----------------------:|----------------:|
88
+ | 35526 | cQXl0pQtiMad | 1 | wallet | 1 | 172.58 |
89
+ | 35799 | olImD2k316Gz | 1 | credit_card | 3 | 16.78 |
90
+ | 13278 | G9MJYXXtPZSz | 1 | credit_card | 10 | 221.86 |
91
+
92
+ Rows: 38279, Columns: 5
93
+
94
+ ---
95
+
96
+ ### **products**
97
+ | | product_id | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm |
98
+ |------:|:-------------|:------------------------|-------------------:|--------------------:|--------------------:|-------------------:|
99
+ | 18191 | hpiXwRzTkhkL | bed_bath_table | 1150 | 40 | 9 | 50 |
100
+ | 2202 | iPoRkE7dkmlc | toys | 15800 | 38 | 62 | 57 |
101
+ | 27442 | hrjNaMt3Wyo5 | toys | 1850 | 37 | 22 | 40 |
102
+
103
+ Rows: 38279, Columns: 6
104
+ """
105
+
106
+ # Improved SQL generation prompt
107
+ sql_system_prompt = """You are a highly skilled natural language to SQL translator. Your goal is to generate accurate SQL queries based on the provided database schema. You must only return the SQL query and no other text or explanations.
108
+
109
+ DATABASE SCHEMA:
110
+ {db_schema}
111
+ """
112
+ sql_chat_template = """
113
+
114
+ Translate the following natural language question into an accurate SQL query. Return only the SQL query.
115
+
116
+ QUESTION: {question}
117
+
118
+ ### assistant:
119
+
120
+ """
121
+
122
+ # Improved prompt for classifying the question
123
+ classification_system_prompt = """You are an expert at classifying user questions as requiring a SQL query or being generic based on the provided database schema. Your response should be ONLY 'SQL' or 'GENERIC'.
124
+
125
+ A question requires a SQL query if it asks for specific data that can be retrieved from the tables in the schema. A question is generic if it asks for explanations, definitions, or information not directly retrievable through a SQL query on the given schema.
126
+
127
+ Consider the following database schema:
128
+ {db_schema}
129
+
130
+ Here are some examples:
131
+
132
+ Question: What are the names of all customers?
133
+ Response: SQL
134
+
135
+ Question: Tell me about the sales table.
136
+ Response: GENERIC
137
+
138
+ Question: How much did product 'Product A' sell for?
139
+ Response: SQL
140
+
141
+ Question: What is a primary key?
142
+ Response: GENERIC
143
+ """
144
+ classification_chat_template = """
145
+
146
+ Determine if the following question requires a SQL query based on the database schema. Respond with 'SQL' or 'GENERIC'.
147
+
148
+ QUESTION: {question}
149
+
150
+ ### assistant:
151
+ """
152
+
153
+
154
+ def classify_question(question: str, use_default_schema: bool = True):
155
+ classification_system_prompt_local = classification_system_prompt # Initialize here
156
+ if use_default_schema:
157
+ classification_system_prompt_local = classification_system_prompt_local.format(
158
+ db_schema=db_schema
159
+ )
160
+ else:
161
+ uploaded_schema = st.session_state.uploaded_df_schema
162
+ classification_system_prompt_local = classification_system_prompt_local.format(
163
+ db_schema=uploaded_schema
164
+ )
165
+ classification_messages = [
166
+ SystemMessage(content=classification_system_prompt_local),
167
+ HumanMessage(content=classification_chat_template.format(question=question)),
168
+ ]
169
+ response = llm.invoke(classification_messages)
170
+ return response.content.strip().upper()
+
+
+ def generate_llm_response(prompt: str, use_default_schema: bool = True):
+     question_type = classify_question(prompt, use_default_schema)
+     chosen_schema = db_schema if use_default_schema else st.session_state.uploaded_df_schema
+     sql_system_prompt_local = sql_system_prompt.format(db_schema=chosen_schema)
+
+     # Retrieve the chat history from the session state
+     chat_history = st.session_state.get("chat_history", [])
+
+     if "SQL" in question_type:
+         print("SQL question detected")
+         st.toast("Detected Task: SQL Query Generation", icon="🚨")
+         formatted_prompt = sql_chat_template.format(question=prompt)
+
+         # Build the messages list: system prompt first, then the chat history
+         messages_for_llm = [SystemMessage(content=sql_system_prompt_local)]
+         for message in chat_history:
+             if isinstance(message, HumanMessage):
+                 messages_for_llm.append(HumanMessage(content=message.content))
+             elif isinstance(message, AIMessage):
+                 # Only include the assistant's text response, not the additional kwargs
+                 messages_for_llm.append(AIMessage(content=message.content))
+         messages_for_llm.append(HumanMessage(content=formatted_prompt))
+
+         full_response = ""
+         for chunk in llm.stream(messages_for_llm):
+             full_response += chunk.content
+             yield f"<sql>\n```sql\n{full_response.strip()}\n```\n</sql>"
+     elif "GENERIC" in question_type:
+         print("Generic question detected")
+         st.toast("Detected Task: Generic QA", icon="🚨")
+         generic_prompt = f"Answer the following question related to SQL or coding:\n\nQUESTION: {prompt}\n\n### assistant:"
+
+         # Build the messages list: system prompt first, then the chat history
+         messages_for_generic = [
+             SystemMessage(
+                 content=f"You are a helpful assistant fine-tuned from Qwen2.5-coder:3B-Instruct for answering questions about SQL.\nYou have a database with the Database Schema:\n{chosen_schema}.\n"
+             )
+         ]
+         for message in chat_history:
+             if isinstance(message, HumanMessage):
+                 messages_for_generic.append(HumanMessage(content=message.content))
+             elif isinstance(message, AIMessage):
+                 # Only include the assistant's text response, not the additional kwargs
+                 messages_for_generic.append(AIMessage(content=message.content))
+         messages_for_generic.append(HumanMessage(content=generic_prompt))
+
+         generic_response = ""
+         for chunk in llm.stream(messages_for_generic):
+             generic_response += chunk.content
+             yield generic_response
+     else:
+         yield "I am sorry, I am a small language model fine-tuned specifically to answer questions that can be solved with SQL, so I won't be able to answer this question."
utils/sql_utils.py ADDED
@@ -0,0 +1,64 @@
+ import re
+ import streamlit as st
+ import pandas as pd
+
+ def extract_sql_command(text):
+     """
+     Extracts the SQL command enclosed within ```sql ``` delimiters from a given string.
+
+     Args:
+         text: The input string containing the SQL command.
+
+     Returns:
+         The extracted SQL command as a string, or None if no SQL command is found.
+     """
+     pattern = r"```sql\s*([\s\S]*?)\s*```"
+     match = re.search(pattern, text)
+     if match:
+         return match.group(1).strip()
+     return None
+
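The pattern is non-greedy and tolerant of whitespace around the fences; a quick sanity check with made-up strings:

```python
# Illustrative strings, not taken from the app.
text = "Here you go:\n```sql\nSELECT * FROM customers;\n```"
assert extract_sql_command(text) == "SELECT * FROM customers;"
assert extract_sql_command("no fenced SQL here") is None
```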
22
+ def create_schema(dataframes):
23
+ schema = ""
24
+ for df_name, df in dataframes.items():
25
+ schema += f"### {df_name}\n"
26
+ schema += df.head(3).to_markdown(index=False)
27
+ schema += "\n\nRows: " + str(df.shape[0]) + ", Columns: " + str(df.shape[1]) + "\n\n---\n\n"
28
+ return schema
29
+
30
+
31
+ @st.cache_resource
32
+ def load_defaultdb_schema_text():
33
+ with open("static/database_scema.txt", "r", encoding="utf-8") as file:
34
+ return file.read()
35
+
36
+
37
+ @st.cache_resource
38
+ def load_defaultdb_queries():
39
+ with open("static/default_questions.txt", "r", encoding="utf-8") as file:
40
+ return file.read()
41
+
42
+
43
+ @st.cache_data
44
+ def convert_df(df):
45
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
46
+ return df.to_csv().encode("utf-8")
47
+
48
+
49
+ # Load CSV files into pandas default_dfs
50
+ @st.cache_resource
51
+ def load_data():
52
+ # text-to-sql-streamlit\static\df_Customers.csv
53
+ df_customers = pd.read_csv("static/df_Customers.csv")
54
+ df_order_items = pd.read_csv("static/df_OrderItems.csv")
55
+ df_orders = pd.read_csv("static/df_Orders.csv")
56
+ df_payments = pd.read_csv("static/df_Payments.csv")
57
+ df_products = pd.read_csv("static/df_Products.csv")
58
+ return {
59
+ "customers": df_customers,
60
+ "order_items": df_order_items,
61
+ "orders": df_orders,
62
+ "payments": df_payments,
63
+ "products": df_products,
64
+ }
🤖SQL_Agent.py ADDED
@@ -0,0 +1,310 @@
+ import re
+ import streamlit as st
+ from langchain_core.messages import HumanMessage, AIMessage
+ from utils.llm_logic import generate_llm_response
+ from utils.sql_utils import (
+     extract_sql_command,
+     load_defaultdb_schema_text,
+     load_defaultdb_queries,
+     load_data,
+ )
+ from utils.handle_sql_commands import execute_sql_duckdb
+
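`utils/handle_sql_commands.py` is not shown in this excerpt, so the exact `execute_sql_duckdb` implementation is unknown. Judging from the call sites below (a SQL string plus a dict of DataFrames, returning a DataFrame or None), a minimal sketch using the stock `duckdb` package might look like:

```python
from typing import Optional

import duckdb
import pandas as pd

def execute_sql_duckdb_sketch(sql: str, dataframes: dict) -> Optional[pd.DataFrame]:
    """Hypothetical stand-in, NOT the repo's implementation."""
    con = duckdb.connect(database=":memory:")
    try:
        for name, df in dataframes.items():
            con.register(name, df)  # expose each DataFrame as a queryable view
        return con.execute(sql).df()
    except Exception:
        return None  # the page below treats None as "nothing to show"
    finally:
        con.close()
```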
+
+ st.set_page_config(
+     page_title="Text-to-SQL Agent",
+     page_icon="🤖",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ default_db_questions = {
+     "easy": [
+         "Retrieve all customer IDs and their corresponding cities from the `customers` table.",
+         "List all products along with their category names from the `products` table.",
+         "Fetch the order IDs and their purchase timestamps from the `orders` table.",
+         "Display the distinct payment types available in the `payments` table.",
+         "Find the total number of rows in the `customers` table.",
+     ],
+     "medium": [
+         "Retrieve the total payment value for each order from the `payments` table, grouped by `order_id`.",
+         "Find all orders where the total shipping charges (sum of `shipping_charges`) exceed 100.",
+         "List the names of cities and the number of customers in each city, sorted in descending order of the number of customers.",
+     ],
+     "hard": [
+         "Write a query to find the total revenue (sum of `price` + `shipping_charges`) generated for each product category in the `order_items` table, joined with the `products` table.",
+         "Identify the top 5 products with the highest total sales value (sum of `price`) across all orders.",
+     ],
+ }
+
+
+ default_dfs = load_data()
+ selected_df = default_dfs
+ use_default_schema = True
+
+ st.markdown(
+     """
+     <style>
+     /* Base styles for both themes */
+     .stPageLink {
+         background-image: linear-gradient(to right, #007BFF, #6610F2); /* Gradient background */
+         color: white !important; /* Ensure text is readable on the gradient */
+         padding: 12px 20px !important; /* Slightly larger padding */
+         border-radius: 8px !important; /* More rounded corners */
+         border: none !important; /* Remove default border */
+         text-decoration: none !important;
+         font-weight: 500 !important; /* Slightly lighter font weight */
+         transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out; /* Smooth transitions */
+         box-shadow: 0 2px 5px rgba(0, 0, 0, 0.15); /* Subtle shadow for depth */
+         display: inline-flex;
+         align-items: center;
+         justify-content: center;
+     }
+
+     .stPageLink:hover {
+         transform: scale(1.03); /* Slight scale up on hover */
+         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); /* Increased shadow on hover */
+     }
+
+     .stPageLink span { /* Style the label text */
+         margin-left: 5px; /* Space between icon and text */
+     }
+
+     /* Dark theme adjustments (optional; Streamlit's theme variables would be
+        more robust, but the fixed colors above read reasonably well in both) */
+     /* [data-theme="dark"] .stPageLink {
+     }
+
+     [data-theme="dark"] .stPageLink:hover {
+     } */
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ with st.popover("Click here to see Database Schema", use_container_width=True):
+     uploaded_df_schema = st.session_state.get("uploaded_df_schema", False)
+
+     choice = st.segmented_control(
+         "Choose",
+         ["Default DB", "Uploaded Files"],
+         label_visibility="collapsed",
+         disabled=uploaded_df_schema is False,
+         default="Default DB" if uploaded_df_schema is False else "Uploaded Files",
+     )
+
+     if uploaded_df_schema is False:
+         st.markdown(
+             """> You can also upload your own files to get their schemas. You can then use those schemas to cross-check our answers with ChatGPT/Gemini/Claude (preferred if the question is very complex), or run the queries directly with our Manual Query Executer 😊.
+ - Ask questions
+ - Run queries: automatic + manual
+ - Download results"""
+         )
+         st.page_link(
+             page="pages/3 📂File Upload for SQL.py",
+             label="Upload your own CSV or Excel files",
+             icon="📜",
+         )
+         schema = load_defaultdb_schema_text()
+         st.markdown(schema, unsafe_allow_html=True)
+     elif choice == "Default DB":
+         schema = load_defaultdb_schema_text()
+         st.markdown(schema, unsafe_allow_html=True)
+     else:
+         pretty_schema, markdown = st.tabs(["Schema", "Copy Schema in Markdown"])
+         info_text = (
+             "You can copy this schema and give it to any state-of-the-art LLM (Gemini/ChatGPT/Claude, etc.) to cross-check your answers.\n"
+             "You can also run the queries directly here via the ***Manual Query Executer*** in the sidebar and download your results 😊"
+         )
+         with pretty_schema:
+             st.info(info_text, icon="ℹ️")
+             st.markdown(uploaded_df_schema, unsafe_allow_html=True)
+         with markdown:
+             st.info(info_text, icon="ℹ️")
+             st.markdown(f"```\n{uploaded_df_schema}\n```")
+
+
+ col1, col2 = st.columns([2, 1], vertical_alignment="bottom")
+ with col1:
+     st.header("Natural Language to SQL Query Agent🤖")
+
+ with col2:
+     st.caption("> ***Execute on the Go!*** 🚀 In-Built DuckDB Execution Engine")
+
+ st.caption(
+     "This is a Qwen2.5-Coder-3B model fine-tuned for SQL queries, integrated with LangChain for an agentic workflow. To see the fine-tuning code, [click here](https://www.kaggle.com/code/debopamchowdhury/qwen-2-5coder-3b-instruct-finetuning)."
+ )
+
+ col1, col2 = st.columns([2, 1], vertical_alignment="bottom")
+ with col1:
+     # Button to refresh the conversation
+     if st.button("Start New Conversation", type="primary"):
+         st.session_state.chat_history = []
+         st.session_state.conversation_turns = 0
+         st.rerun()
+ with col2:
+     disabled_selection = True
+     if (
+         "uploaded_dataframes" in st.session_state
+     ) and st.session_state.uploaded_dataframes:
+         disabled_selection = False
+     options = ["default_db", "uploaded_files"]
+     selected = st.segmented_control(
+         "Choose",
+         options,
+         selection_mode="single",
+         disabled=disabled_selection,
+         label_visibility="collapsed",
+         default="default_db" if disabled_selection else "uploaded_files",
+     )
+     if not disabled_selection:
+         if selected == "uploaded_files":
+             selected_df = st.session_state.uploaded_dataframes
+             print(selected_df)
+             use_default_schema = False
+         else:
+             selected_df = default_dfs
+             print(selected_df)
+             use_default_schema = True
+ # NOTE: compare by identity; dict equality over DataFrames can raise
+ # "The truth value of a DataFrame is ambiguous".
+ if selected_df is default_dfs:
+     with st.popover("Default Database Queries 📚 - Trial"):
+         # Fresh name so we don't shadow the `default_db_questions` dict above
+         default_db_questions_text = load_defaultdb_queries()
+         st.markdown(default_db_questions_text)
+
+ # Initialize chat history in session state
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+
+ # Initialize conversation turn counter
+ if "conversation_turns" not in st.session_state:
+     st.session_state.conversation_turns = 0
+
+ # Set the maximum number of conversation turns
+ MAX_TURNS = 5
+
+ # Display existing chat messages
+ for message in st.session_state.chat_history:
+     with st.chat_message(message.type):
+         st.markdown(message.content)
+         if (
+             isinstance(message, AIMessage)
+             and "response_df" in message.additional_kwargs
+             and message.additional_kwargs["response_df"] is not None
+             and not message.additional_kwargs["response_df"].empty
+         ):
+             with st.expander("View SQL-Query Execution Result"):
+                 df = message.additional_kwargs["response_df"]
+                 # download_csv = convert_df(df)
+                 # st.download_button(
+                 #     label="Download data as CSV",
+                 #     data=download_csv,
+                 #     file_name="query_results.csv",
+                 #     mime="text/csv",
+                 # )
+                 # renderer = StreamlitRenderer(
+                 #     df,
+                 #     spec_io_mode="rw",
+                 #     default_tab="data",
+                 #     appearance="dark",
+                 #     kernel_computation=True,
+                 # )
+                 # renderer.explorer(default_tab="data")
+                 st.dataframe(df)
+                 st.info(f"Rows x Columns: {df.shape[0]} x {df.shape[1]}")
+                 st.subheader("Data Description:")
+                 st.markdown(df.describe().T.to_markdown())
+                 st.subheader("Data Types:")
+                 st.write(df.dtypes)
+
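The display loop above leans on one convention worth calling out: the executed result rides along with the assistant message in LangChain's `additional_kwargs`, which is set when the message is appended near the end of this file. Condensed (with `some_df` as a placeholder for a query result):

```python
from langchain_core.messages import AIMessage

# Stash a payload on the message when appending to history...
msg = AIMessage(content="...", additional_kwargs={"response_df": some_df})
# ...and read it back when re-rendering history on the next rerun.
df = msg.additional_kwargs.get("response_df")
```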
+ # Accept user input only while the conversation turn limit has not been reached
+ if st.session_state.conversation_turns < MAX_TURNS:
+     if prompt := st.chat_input("Ask me a SQL query question"):
+         # Add user message to chat history in session state
+         st.session_state.chat_history.append(HumanMessage(content=prompt))
+         # Display user message in chat
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         duckdb_result = None
+         # Get assistant response with streaming
+         with st.chat_message("assistant"):
+             message_placeholder = st.empty()
+             full_response = ""
+             with st.spinner(
+                 "I know this is taking a while. The model runs on the free, small vCPUs that HuggingFace Spaces provides for deployment. Thank you so much for your patience 😊"
+             ):
+                 for response_so_far in generate_llm_response(
+                     prompt, use_default_schema
+                 ):
+                     # Remove <sql> and </sql> tags for streaming display
+                     streaming_response = response_so_far.replace("<sql>", "").replace(
+                         "</sql>", ""
+                     )
+                     # Collapse duplicate ```sql tags (with or without space) for streaming display
+                     streaming_response = re.sub(
+                         r"```sql\s*```sql", "```sql", streaming_response
+                     )
+                     message_placeholder.markdown(streaming_response + "▌")
+                     full_response = response_so_far
+
+             # Remove <sql> and </sql> tags from the full response
+             full_response = full_response.replace("<sql>", "").replace("</sql>", "")
+             # Collapse duplicate ```sql tags (with or without space) in the full response
+             full_response = re.sub(r"```sql\s*```sql", "```sql", full_response)
+             # Normalize any trailing run of backticks/whitespace into a single
+             # closing fence (handles doubled ``` and partial fences left over
+             # from streaming)
+             full_response = re.sub(r"(\s*`+)+\s*$", "\n```", full_response)
+             message_placeholder.markdown(full_response)
+
+             sql_command = extract_sql_command(full_response)
+             if sql_command:
+                 duckdb_result = execute_sql_duckdb(sql_command, selected_df)
+                 if duckdb_result is not None:
+                     st.text("Query Execution Result:")
+                     with st.expander("View Result"):
+                         st.dataframe(duckdb_result)
+                         st.info(
+                             f"Rows x Columns: {duckdb_result.shape[0]} x {duckdb_result.shape[1]}"
+                         )
+                         st.subheader("Data Description:")
+                         st.markdown(duckdb_result.describe().T.to_markdown())
+                         st.subheader("Data Types:")
+                         st.write(duckdb_result.dtypes)
+                         # renderer = StreamlitRenderer(
+                         #     duckdb_result,
+                         #     spec_io_mode="rw",
+                         #     default_tab="data",
+                         #     appearance="dark",
+                         #     kernel_computation=True,
+                         # )
+                         # renderer.explorer(default_tab="data")
+             else:
+                 # No SQL command found in the response; nothing to execute
+                 pass
+
+         # Add assistant response to chat history in session state
+         st.session_state.chat_history.append(
+             AIMessage(
+                 content=full_response,
+                 additional_kwargs={"response_df": duckdb_result},
+             )
+         )
+
+         # Increment the conversation turn counter
+         st.session_state.conversation_turns += 1
+ else:
+     st.warning(
+         "Maximum number of questions reached. Please click 'Start New Conversation' to continue."
+     )
+     st.chat_input("Ask me a SQL query question", disabled=True)  # Disable the input field