Ervinoreo committed
Commit e9e516f · 1 Parent(s): dab98a9

firsst

Browse files:
- .gitignore +109 -0
- app.py +632 -0
- rag_system.py +460 -0
- requirements.txt +14 -0
- requirements_clean.txt +11 -0
- sample_documents/sample_university_requirements.txt +177 -0
- start.sh +43 -0
- test_system.py +192 -0
.gitignore
ADDED
@@ -0,0 +1,109 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environment
.venv/
.env/
venv/
ENV/
env/
.venv

# Environment Variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# macOS
.DS_Store
.AppleDouble
.LSOverride

# Windows
Thumbs.db
ehthumbs.db
Desktop.ini

# Jupyter Notebooks
.ipynb_checkpoints

# AI/ML specific
chroma_db/
chromadb/
*.db
*.sqlite
*.sqlite3

# Document storage
documents/
uploaded_documents/
temp_documents/

# Query results and cache
query_results/
.cache/
.streamlit/

# Model downloads and cache
models/
.transformers_cache/
.huggingface/
sentence_transformers_cache/

# Logs
*.log
logs/
.logs/

# Temporary files
tmp/
temp/
.tmp/

# Coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
app.py
ADDED
@@ -0,0 +1,632 @@
import streamlit as st
import os
from urllib.parse import urlparse, parse_qs
from rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
from datetime import datetime
import uuid

# Configure Streamlit page
st.set_page_config(
    page_title="PanSea University Search",
    page_icon="🎓",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS - Dark theme compatible
st.markdown("""
<style>
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    margin: -1rem -1rem 2rem -1rem;
    border-radius: 10px;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stApp {
    background: var(--background-color);
}

/* Dark theme compatible containers */
.query-result {
    background: rgba(255, 255, 255, 0.05);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255, 255, 255, 0.1);
    padding: 1.5rem;
    border-radius: 15px;
    margin: 1rem 0;
    color: var(--text-color);
}

.source-doc {
    background: rgba(31, 119, 180, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-left: 4px solid #1f77b4;
    border-radius: 8px;
    margin: 0.5rem 0;
    color: var(--text-color);
}

.share-link {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-radius: 10px;
    border-left: 4px solid #2ecc71;
    color: var(--text-color);
}

/* Model indicator boxes */
.model-info {
    background: rgba(52, 152, 219, 0.15);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #3498db;
    margin: 10px 0;
}

/* Language selection enhancement */
.language-selection {
    background: rgba(155, 89, 182, 0.1);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #9b59b6;
    margin: 10px 0;
}

/* Upload area enhancement */
.stFileUploader {
    background: rgba(230, 126, 34, 0.1);
    backdrop-filter: blur(10px);
    padding: 20px;
    border-radius: 15px;
    border: 2px dashed #e67e22;
}

.stFileUploader label {
    font-size: 1.2rem;
    font-weight: bold;
    color: var(--text-color);
}

/* Button enhancements */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    border-radius: 10px;
    padding: 0.6rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
}

/* Sidebar enhancements */
.css-1d391kg {
    background: rgba(255, 255, 255, 0.02);
    backdrop-filter: blur(10px);
}

/* Info boxes */
.stInfo {
    background: rgba(52, 152, 219, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #3498db;
}

.stSuccess {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #2ecc71;
}

.stWarning {
    background: rgba(241, 196, 15, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #f1c40f;
}

.stError {
    background: rgba(231, 76, 60, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #e74c3c;
}
</style>
""", unsafe_allow_html=True)

def main():
    # Check for shared query in URL (experimental API matches the pinned Streamlit 1.29 and returns lists)
    query_params = st.experimental_get_query_params()
    shared_query_id = query_params.get("share", [None])[0]

    if shared_query_id:
        display_shared_query(shared_query_id)
        return

    # Main header
    st.markdown("""
    <div class="main-header">
        <h1>🎓 PanSea University Search</h1>
        <p>AI-Powered Study Search Platform for ASEAN Universities</p>
    </div>
    """, unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.header("📋 Navigation")
        page = st.selectbox(
            "Choose a page:",
            ["🔍 Search Universities", "📄 Upload Documents", "ℹ️ About"]
        )

        # Show embedding model info
        st.markdown("---")
        try:
            from rag_system import RAGSystem
            temp_rag = RAGSystem()
            if hasattr(temp_rag.embeddings, 'model') and temp_rag.embeddings.model:
                st.markdown("""
                <div class='language-selection'>
                    <h5 style='margin: 0; color: #9b59b6;'>🔧 Embedding Model</h5>
                    <p style='margin: 5px 0; font-size: 0.9em;'>BGE-small-en-v1.5</p>
                </div>
                """, unsafe_allow_html=True)
            else:
                st.markdown("""
                <div class='language-selection'>
                    <h5 style='margin: 0; color: #9b59b6;'>🔧 Embedding Model</h5>
                    <p style='margin: 5px 0; font-size: 0.9em;'>OpenAI Ada-002</p>
                </div>
                """, unsafe_allow_html=True)
        except Exception:
            pass

    # Main content based on selected page
    if page == "📄 Upload Documents":
        upload_documents_page()
    elif page == "ℹ️ About":
        about_page()
    else:
        search_page()

def upload_documents_page():
    st.header("📄 Upload University Documents")
    st.write("Upload PDF documents containing university admission requirements, fees, and program information.")

    col1, col2 = st.columns(2)

    with col1:
        university_name = st.text_input("🏫 University Name", placeholder="e.g., National University of Singapore")
        country = st.selectbox(
            "🌏 Country",
            ["", "Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei", "Cambodia", "Laos", "Myanmar"]
        )

    with col2:
        document_type = st.selectbox(
            "📋 Document Type",
            ["admission_requirements", "tuition_fees", "program_catalog", "application_guide", "scholarship_info"]
        )
        language = st.selectbox(
            "🌐 Primary Language",
            ["English", "Chinese", "Malay", "Thai", "Indonesian", "Vietnamese", "Filipino", "Other"]
        )

    # File upload
    uploaded_files = st.file_uploader(
        "Choose PDF files",
        accept_multiple_files=True,
        type=['pdf'],
        help="Select one or more PDF files to upload"
    )

    if uploaded_files and st.button("🚀 Process Documents", type="primary"):
        if not university_name or not country:
            st.error("Please provide university name and country.")
            return

        with st.spinner("Processing documents... This may take a few minutes."):
            try:
                # Initialize document ingestion
                doc_ingestion = DocumentIngestion()

                # Process documents
                documents = doc_ingestion.process_documents(
                    uploaded_files, university_name, country, document_type
                )

                if documents:
                    # Create or update vector store
                    vectorstore = doc_ingestion.create_vector_store(documents)

                    if vectorstore:
                        st.success(f"✅ Successfully processed {len(documents)} documents!")
                        st.info(f"Documents from {university_name} ({country}) have been added to the knowledge base.")

                        # Show processed files
                        with st.expander("📋 Processed Files"):
                            for doc in documents:
                                st.write(f"• **{doc.metadata['source']}**")
                                st.write(f" - University: {doc.metadata['university']}")
                                st.write(f" - Country: {doc.metadata['country']}")
                                st.write(f" - Type: {doc.metadata['document_type']}")
                                st.write("---")
                else:
                    st.error("No documents were successfully processed.")

            except Exception as e:
                st.error(f"Error processing documents: {str(e)}")

def search_page():
    st.header("🔍 Search University Information")

    # Language selection
    col1, col2 = st.columns([3, 1])
    with col1:
        st.write("Ask questions about university admissions, requirements, fees, and programs:")
    with col2:
        response_language = st.selectbox(
            "Response Language",
            ["English", "中文 (Chinese)", "Bahasa Malaysia", "ไทย (Thai)", "Bahasa Indonesia", "Tiếng Việt (Vietnamese)"],
            key="response_language"
        )

    # Show language info
    language_map = {
        "English": "English",
        "中文 (Chinese)": "Chinese",
        "Bahasa Malaysia": "Malay",
        "ไทย (Thai)": "Thai",
        "Bahasa Indonesia": "Indonesian",
        "Tiếng Việt (Vietnamese)": "Vietnamese"
    }
    selected_lang = language_map.get(response_language, "English")

    if selected_lang != "English":
        st.info(f"🌐 AI will respond in **{selected_lang}** based on your selection")

    # Example queries with model indicators
    st.markdown("**💡 Example queries:**")

    # Add model selection explanation
    st.markdown("""
    <div class='model-info'>
        <h4 style='margin: 0; color: #3498db;'>🤖 AI Model Selection</h4>
        <p style='margin: 5px 0;'><strong>🧠 Reasoning Model (SEA-LION v3.5):</strong> Complex university searches with multiple criteria, comparisons, budget constraints</p>
        <p style='margin: 5px 0;'><strong>⚡ Instruct Model (SEA-LION v3):</strong> Simple questions, translations, definitions, basic information</p>
        <p style='margin: 5px 0; font-style: italic;'>The system automatically chooses the best model for your query!</p>
    </div>
    """, unsafe_allow_html=True)

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**🧠 Complex Queries (Uses Reasoning Model):**")
        complex_examples = [
            "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
            "专科毕业,无雅思,想在马来西亚读硕士,学费不超过4万人民币/年",
            "Compare engineering programs in Thailand and Singapore under $15,000 per year",
            "Find MBA programs in ASEAN with GMAT requirements and scholarships available"
        ]
        for example in complex_examples:
            st.markdown(f"• {example}")

    with col2:
        st.markdown("**⚡ Simple Queries (Uses Instruct Model):**")
        simple_examples = [
            "What does IELTS stand for?",
            "Translate 'application deadline' to Chinese",
            "What is the difference between bachelor and master degree?",
            "How to say 'university' in Thai?"
        ]
        for example in simple_examples:
            st.markdown(f"• {example}")

    st.markdown("---")  # Separator line

    # Query input - main input field (always available)
    query = st.text_area(
        "Your question:",
        height=100,
        placeholder="e.g., What are the admission requirements for computer science programs in Singapore?",
        help="Type your question here or select an example below to get started."
    )

    # Show search status
    if query.strip():
        st.success("✅ Ready to search! Click the search button when you're ready.")
    else:
        st.info("💭 Enter your question in the text box above to start searching.")

    # Optional: Quick example selection (just for convenience)
    with st.expander("💡 Example Queries (Click to Use)"):
        # Combine all examples
        all_examples = complex_examples + simple_examples

        col1, col2 = st.columns(2)
        with col1:
            st.markdown("**🧠 Complex Examples:**")
            for example in complex_examples:
                if st.button(example[:60] + "...", key=f"ex_{hash(example)}", help=f"Click to use: {example}"):
                    st.session_state.example_query = example

        with col2:
            st.markdown("**⚡ Simple Examples:**")
            for example in simple_examples:
                if st.button(example[:60] + "...", key=f"ex_{hash(example)}", help=f"Click to use: {example}"):
                    st.session_state.example_query = example

    # Use selected example if any
    if hasattr(st.session_state, 'example_query') and st.session_state.example_query:
        query = st.session_state.example_query
        st.info(f"📝 Using example: {query[:100]}...")
        # Clear the example after use
        del st.session_state.example_query

    # Additional filters
    with st.expander("🔧 Advanced Filters (Optional)"):
        col1, col2, col3 = st.columns(3)
        with col1:
            budget_range = st.select_slider(
                "Budget Range (USD/year)",
                options=["Any", "<10k", "10k-20k", "20k-30k", "30k-40k", ">40k"],
                value="Any"
            )
        with col2:
            study_level = st.multiselect(
                "Study Level",
                ["Diploma", "Bachelor", "Master", "PhD"],
                default=[]
            )
        with col3:
            preferred_countries = st.multiselect(
                "Preferred Countries",
                ["Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei"],
                default=[]
            )

    # Search button - enabled as soon as there's text in the query
    search_disabled = not query.strip()
    button_text = "🔍 Search" if not search_disabled else "🔍 Search (Enter a question first)"

    if st.button(button_text, type="primary", disabled=search_disabled):
        if not query.strip():
            st.error("Please enter a question.")
            return

        # Get the language code for processing
        language_map = {
            "English": "English",
            "中文 (Chinese)": "Chinese",
            "Bahasa Malaysia": "Malay",
            "ไทย (Thai)": "Thai",
            "Bahasa Indonesia": "Indonesian",
            "Tiếng Việt (Vietnamese)": "Vietnamese"
        }
        language_code = language_map.get(response_language, "English")

        with st.spinner("Searching for information..."):
            try:
                # Initialize RAG system
                rag_system = RAGSystem()

                # Show which model will be used
                from rag_system import classify_query_type
                query_type = classify_query_type(query)

                if query_type == "complex":
                    st.info("🧠 **Using SEA-LION Reasoning Model (v3.5)** - Complex query detected")
                else:
                    st.info("⚡ **Using SEA-LION Instruct Model (v3)** - Simple query/translation detected")

                # Show translation status if not English
                if response_language != "English":
                    st.info(f"🌐 **Translating response to {response_language}**")

                # Add filters to query if specified
                enhanced_query = query
                if budget_range != "Any" or study_level or preferred_countries:
                    filters = []
                    if budget_range != "Any":
                        filters.append(f"budget range: {budget_range}")
                    if study_level:
                        filters.append(f"study levels: {', '.join(study_level)}")
                    if preferred_countries:
                        filters.append(f"countries: {', '.join(preferred_countries)}")

                    enhanced_query += f"\n\nAdditional filters: {'; '.join(filters)}"

                # Get response
                result = rag_system.query(enhanced_query, language_code)

                if result:
                    # Save query result for sharing
                    save_query_result(result)

                    # Display results
                    display_query_result(result, show_share_link=True)
                else:
                    st.error("No results found. Try rephrasing your question or upload more documents.")

            except Exception as e:
                st.error(f"Error searching: {str(e)}")

def display_query_result(result, show_share_link=False):
    """Display query results in a formatted way."""
    st.markdown('<div class="query-result">', unsafe_allow_html=True)

    # Show which model was used
    if result.get("model_used"):
        st.info(f"🤖 **Model Used:** {result['model_used']}")

    st.subheader("🎯 Answer")
    st.write(result["answer"])

    # Share link
    if show_share_link and result.get("query_id"):
        st.markdown("---")
        current_url = st.get_option("browser.serverAddress") or "localhost:8501"
        share_url = f"http://{current_url}?share={result['query_id']}"
        st.markdown(f"""
        <div class="share-link">
            <strong>🔗 Share this result:</strong><br>
            <code>{share_url}</code>
        </div>
        """, unsafe_allow_html=True)

        if st.button("📋 Copy Share Link"):
            st.code(share_url)

    # Source documents
    if result.get("source_documents"):
        st.markdown("---")
        st.subheader("📚 Sources")
        for i, doc in enumerate(result["source_documents"], 1):
            with st.expander(f"Source {i}: {doc.metadata.get('source', 'Unknown')}"):
                col1, col2 = st.columns([1, 2])
                with col1:
                    st.write(f"**University:** {doc.metadata.get('university', 'Unknown')}")
                    st.write(f"**Country:** {doc.metadata.get('country', 'Unknown')}")
                    st.write(f"**Type:** {doc.metadata.get('document_type', 'Unknown')}")
                with col2:
                    st.write("**Relevant Content:**")
                    content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
                    st.write(content_preview)

    st.markdown('</div>', unsafe_allow_html=True)

def display_shared_query(query_id):
    """Display a shared query result."""
    st.header("🔗 Shared Query Result")

    result_data = load_shared_query(query_id)

    if result_data:
        st.info(f"**Original Question:** {result_data['question']}")
        st.write(f"**Language:** {result_data['language']}")
        st.write(f"**Date:** {result_data['timestamp'][:10]}")

        # Create a mock result object for display
        mock_result = {
            "answer": result_data["answer"],
            "source_documents": [
                type('MockDoc', (), {
                    'metadata': source,
                    'page_content': source.get('content_preview', '')
                })() for source in result_data.get('sources', [])
            ]
        }

        display_query_result(mock_result, show_share_link=False)

        if st.button("🔍 Ask Your Own Question"):
            st.experimental_set_query_params()
            st.experimental_rerun()
    else:
        st.error("❌ Shared query not found or has expired.")
        if st.button("🏠 Go to Home"):
            st.experimental_set_query_params()
            st.experimental_rerun()

def about_page():
    st.header("ℹ️ About PanSea University Search")

    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("""
### 🎯 Problem We Solve

Prospective students worldwide seeking to study abroad face difficulty finding accurate, up-to-date university admission requirements. Information is scattered across PDFs, brochures, and outdated agency websites. Many waste time applying to unsuitable programs due to missing criteria and pay high agent fees.

### 💡 Our Solution

PanSea is an LLM-powered, RAG-based study search platform powered by **SEA-LION models** that ingests official admissions documents from ASEAN universities. Students can query in any ASEAN language and receive:

- 📋 **Ranked program matches** with detailed requirements
- 💰 **Tuition fees and costs**
- 📅 **Application deadlines and windows**
- 🎓 **Entry requirements and prerequisites**
- 📖 **Source citations** from official documents

### 🤖 AI Models Used

- **SEA-LION v3.5 Reasoning Model**: For complex university search queries requiring multi-step reasoning
- **SEA-LION v3 Instruct Model**: For translation and simple question-answering
- **Automatic Model Selection**: The system intelligently chooses the appropriate model based on query complexity

### 🌏 Supported Languages

- English
- 中文 (Chinese)
- Bahasa Malaysia
- ไทย (Thai)
- Bahasa Indonesia
- Tiếng Việt (Vietnamese)
- Filipino

### 🔧 How It Works

1. **📄 Document Ingestion**: Upload official PDF documents from universities
2. **🔍 AI Processing**: Our system processes and indexes the content
3. **❓ Natural Language Queries**: Ask questions in your preferred language
4. **🎯 Intelligent Answers**: Get relevant, sourced responses
5. **🔗 Share Results**: Generate shareable links for your queries
        """)

    with col2:
        st.markdown("""
### 📊 Features

✅ **Multi-language support**
✅ **PDF document ingestion**
✅ **Intelligent search & retrieval**
✅ **Source citations**
✅ **Shareable query results**
✅ **Advanced filtering**
✅ **Real-time processing**

### 🏛️ Target Universities

- 🇸🇬 Singapore
- 🇲🇾 Malaysia
- 🇹🇭 Thailand
- 🇮🇩 Indonesia
- 🇵🇭 Philippines
- 🇻🇳 Vietnam
- 🇧🇳 Brunei
- 🇰🇭 Cambodia
- 🇱🇦 Laos
- 🇲🇲 Myanmar

### 🚀 Get Started

1. Go to **Upload Documents** to add university PDFs
2. Use **Search Universities** to ask questions
3. Share your results with others!
        """)

if __name__ == "__main__":
    # Check if SEA-LION API key is set
    if not os.getenv("SEA_LION_API_KEY"):
        st.error("🚨 SEA-LION API Key not found! Please set your SEA_LION_API_KEY in the .env file.")
        st.code("SEA_LION_API_KEY=your_api_key_here")
        st.stop()

    # Check if OpenAI API key is set (needed for embeddings)
    if not os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY") == "your_openai_api_key_here":
        st.warning("⚠️ OpenAI API Key not configured properly. You'll need it for document embeddings.")
        st.info("The system will use SEA-LION models for text generation, but OpenAI for document embeddings.")

    main()
rag_system.py
ADDED
@@ -0,0 +1,460 @@
import os
import uuid
import tempfile
from typing import List, Optional, Dict, Any
import streamlit as st
from pathlib import Path
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from dotenv import load_dotenv
import chromadb
from datetime import datetime
import json
import base64
from openai import OpenAI
import re

# Load environment variables
load_dotenv()

class AlternativeEmbeddings:
    """Alternative embeddings using Sentence Transformers when OpenAI is not available"""

    def __init__(self):
        try:
            from sentence_transformers import SentenceTransformer
            # Use BGE-small-en for better performance
            self.model = SentenceTransformer('BAAI/bge-small-en-v1.5')
            self.embedding_size = 384
        except ImportError:
            st.error("sentence-transformers not available. Please install it or provide OpenAI API key.")
            self.model = None

    def embed_documents(self, texts):
        if not self.model:
            return []
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        if not self.model:
            return []
        return self.model.encode([text])[0].tolist()

class SEALionLLM:
    """Custom LLM class for SEA-LION models"""

    def __init__(self):
        self.client = OpenAI(
            api_key=os.getenv("SEA_LION_API_KEY"),
            base_url=os.getenv("SEA_LION_BASE_URL", "https://api.sea-lion.ai/v1")
        )

        # Model configurations
        self.instruct_model = "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        self.reasoning_model = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

    def _is_complex_query(self, query: str) -> bool:
        """Determine if query requires reasoning model or simple instruct model"""
        # Keywords that indicate complex university search queries
        complex_keywords = [
            "university", "admission", "requirement", "tuition", "fee", "program", "course",
            "degree", "master", "bachelor", "phd", "scholarship", "deadline", "application",
            "budget", "under", "less than", "below", "compare", "recommend", "suggest",
            "which", "what are the", "show me", "find me", "search for",
            # Chinese keywords
            "大学", "学费", "专业", "硕士", "学士", "博士", "申请", "要求", "奖学金",
            # Malay keywords
            "universiti", "yuran", "program", "ijazah", "syarat", "permohonan",
            # Thai keywords
            "มหาวิทยาลัย", "ค่าเล่าเรียน", "หลักสูตร", "ปริญญา", "เงื่อนไข",
            # Indonesian keywords
            "universitas", "biaya", "kuliah", "program", "sarjana", "persyaratan"
        ]

        # Check for multiple criteria (indicates complex search)
        criteria_count = 0
        query_lower = query.lower()

        for keyword in complex_keywords:
            if keyword.lower() in query_lower:
                criteria_count += 1

        # Also check for comparison words, numbers, conditions
        comparison_patterns = [
            r'under \$?\d+', r'less than \$?\d+', r'below \$?\d+', r'between \$?\d+ and \$?\d+',
            r'不超过.*元', r'低于.*元', r'少于.*元',  # Chinese
            r'kurang dari', r'di bawah',  # Malay/Indonesian
            r'น้อยกว่า', r'ต่ำกว่า'  # Thai
        ]

        for pattern in comparison_patterns:
            if re.search(pattern, query_lower):
                criteria_count += 2

        # Complex query if multiple keywords or comparison patterns found
        return criteria_count >= 2

    def _is_translation_query(self, query: str) -> bool:
        """Check if query is primarily for translation"""
        translation_keywords = [
            "translate", "translation", "แปล", "翻译", "terjemah", "traduire"
        ]

        query_lower = query.lower()
        return any(keyword in query_lower for keyword in translation_keywords)

    def generate_response(self, query: str, context: str = "", language: str = "English") -> str:
        """Generate response using appropriate SEA-LION model"""

        # Choose model based on query complexity
        if self._is_translation_query(query) or not self._is_complex_query(query):
            model = self.instruct_model
            use_reasoning = False
        else:
            model = self.reasoning_model
            use_reasoning = True

        # Prepare messages
        system_prompt = f"""You are a helpful assistant specializing in ASEAN university admissions.
Respond in {language} unless specifically asked otherwise.

If provided with context from university documents, use that information to give accurate, specific answers.
Always cite your sources when using provided context.

For complex university search queries, provide:
1. Direct answers to the question
2. Relevant admission requirements
3. Tuition fees (if available)
4. Application deadlines (if available)
5. Source citations from the documents

Context: {context}"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]

        try:
            if use_reasoning:
                # Use reasoning model with thinking mode
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=2000,
                    temperature=0.1,
                    extra_body={"thinking_mode": True}
                )
            else:
                # Use instruct model for simpler queries
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=1500,
                    temperature=0.3
                )

            return response.choices[0].message.content

        except Exception as e:
            st.error(f"Error with SEA-LION model: {str(e)}")
            # Fallback to a simple response
            return f"I apologize, but I encountered an error processing your query. Please try rephrasing your question. Error: {str(e)}"

def classify_query_type(query: str) -> str:
    """Public function to classify query type for UI display"""
    # Create a temporary SEALionLLM instance just for classification
    temp_llm = SEALionLLM()

    if temp_llm._is_translation_query(query) or not temp_llm._is_complex_query(query):
        return "simple"
    else:
        return "complex"

class DocumentIngestion:
    def __init__(self):
        # Use BGE embeddings by default for better performance
        try:
            self.embeddings = AlternativeEmbeddings()
            self.embedding_type = "BGE-small-en"
            if not self.embeddings.model:
                raise Exception("BGE model not available")
        except Exception:
            # Fallback to OpenAI if BGE not available
            openai_key = os.getenv("OPENAI_API_KEY")
            if openai_key and openai_key != "placeholder_for_embeddings" and openai_key != "your_openai_api_key_here":
                try:
                    self.embeddings = OpenAIEmbeddings()
                    self.embedding_type = "OpenAI"
                except Exception as e:
                    st.error("Both BGE and OpenAI embeddings failed. Please check your setup.")
                    raise e
            else:
                st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
                raise Exception("No embedding model available")

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
        os.makedirs(self.persist_directory, exist_ok=True)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from uploaded PDF file with multiple fallback methods."""
        try:
            # Method 1: Try with PyPDF2 (handles most PDFs including encrypted ones with PyCryptodome)
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Check if PDF is encrypted
            if pdf_reader.is_encrypted:
                # Try to decrypt with empty password (common for protected but not password-protected PDFs)
                try:
                    pdf_reader.decrypt("")
                except Exception:
                    st.warning(f"PDF {pdf_file.name} is password-protected. Please provide an unprotected version.")
                    return ""

            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    text += page_text + "\n"
                except Exception as e:
                    st.warning(f"Could not extract text from page {page_num + 1} of {pdf_file.name}: {str(e)}")
                    continue

            if text.strip():
                return text
            else:
                st.warning(f"No extractable text found in {pdf_file.name}. This might be a scanned PDF or image-based document.")
                return ""

        except Exception as e:
            error_msg = str(e)
            if "PyCryptodome" in error_msg:
                st.error(f"Encryption error with {pdf_file.name}: {error_msg}")
                st.info("💡 The PDF uses encryption. PyCryptodome has been installed to handle this.")
            elif "password" in error_msg.lower():
                st.error(f"Password-protected PDF: {pdf_file.name}")
                st.info("💡 Please provide an unprotected version of this PDF.")
            else:
                st.error(f"Error extracting text from {pdf_file.name}: {error_msg}")
            return ""

    def process_documents(self, uploaded_files, university_name: str = "",
                          country: str = "", document_type: str = "admission_requirements") -> List[Document]:
        """Process uploaded PDF files and convert to documents."""
        documents = []
        processed_count = 0
        failed_count = 0

        st.info(f"📄 Processing {len(uploaded_files)} document(s)...")

        for uploaded_file in uploaded_files:
            if uploaded_file.type == "application/pdf":
                st.write(f"🔍 Extracting text from: **{uploaded_file.name}**")

                # Extract text
                text = self.extract_text_from_pdf(uploaded_file)

                if text.strip():
                    # Create metadata
                    metadata = {
                        "source": uploaded_file.name,
                        "university": university_name,
                        "country": country,
                        "document_type": document_type,
                        "upload_timestamp": datetime.now().isoformat(),
                        "file_id": str(uuid.uuid4())
                    }

                    # Create document
                    doc = Document(
                        page_content=text,
                        metadata=metadata
                    )
                    documents.append(doc)
                    processed_count += 1
                    st.success(f"✅ Successfully processed: **{uploaded_file.name}** ({len(text)} characters)")
                else:
                    failed_count += 1
                    st.warning(f"⚠️ Could not extract text from **{uploaded_file.name}**")
            else:
                failed_count += 1
                st.error(f"❌ Unsupported file type: **{uploaded_file.type}** for {uploaded_file.name}")

        # Summary
        if processed_count > 0:
            st.success(f"🎉 Successfully processed **{processed_count}** document(s)")
        if failed_count > 0:
            st.warning(f"⚠️ Failed to process **{failed_count}** document(s)")

        return documents

    def create_vector_store(self, documents: List[Document]) -> Chroma:
        """Create and persist vector store from documents."""
        if not documents:
            st.error("No documents to process")
            return None

        # Split documents into chunks
        texts = self.text_splitter.split_documents(documents)

        # Create vector store
        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        return vectorstore

    def load_existing_vectorstore(self) -> Optional[Chroma]:
        """Load existing vector store if it exists."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            st.warning(f"Could not load existing vector store: {str(e)}")
            return None

class RAGSystem:
    def __init__(self):
        # Initialize embeddings - try BGE first, fallback to OpenAI
        try:
            self.embeddings = AlternativeEmbeddings()
            if not self.embeddings.model:
                # Fallback to OpenAI if BGE not available
                self.embeddings = OpenAIEmbeddings()
        except Exception:
            # If both fail, use OpenAI as last resort
            self.embeddings = OpenAIEmbeddings()

        self.sea_lion_llm = SEALionLLM()
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")

    def get_vectorstore(self) -> Optional[Chroma]:
        """Get the vector store."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            st.error(f"Error loading vector store: {str(e)}")
            return None

    def query(self, question: str, language: str = "English") -> Dict[str, Any]:
        """Query the RAG system using SEA-LION models."""
        vectorstore = self.get_vectorstore()
        if not vectorstore:
            return {
                "answer": "No documents have been ingested yet. Please upload some PDF documents first.",
                "source_documents": [],
                "query_id": None
            }

        try:
            # Retrieve relevant documents
            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
            relevant_docs = retriever.get_relevant_documents(question)

            # Prepare context from retrieved documents
            context_parts = []
            for i, doc in enumerate(relevant_docs, 1):
                source_info = doc.metadata.get('source', 'Unknown')
                university = doc.metadata.get('university', 'Unknown')
                country = doc.metadata.get('country', 'Unknown')

                context_parts.append(f"""
Document {i} (Source: {source_info}, University: {university}, Country: {country}):
{doc.page_content[:500]}...
""")

            context = "\n".join(context_parts)

            # Generate response using SEA-LION model
            answer = self.sea_lion_llm.generate_response(
                query=question,
                context=context,
                language=language
            )

            # Generate query ID for sharing
            query_id = str(uuid.uuid4())

            return {
                "answer": answer,
                "source_documents": relevant_docs,
                "query_id": query_id,
                "original_question": question,
                "language": language,
                "model_used": "SEA-LION" + (" Reasoning" if self.sea_lion_llm._is_complex_query(question) else " Instruct")
            }

        except Exception as e:
            st.error(f"Error querying system: {str(e)}")
            return {
                "answer": f"Error processing your question: {str(e)}",
                "source_documents": [],
                "query_id": None
            }

def save_query_result(query_result: Dict[str, Any]):
    """Save query result for sharing."""
    if query_result.get("query_id"):
        results_dir = "query_results"
        os.makedirs(results_dir, exist_ok=True)

        result_file = f"{results_dir}/{query_result['query_id']}.json"

        # Prepare data for saving (remove non-serializable objects)
        save_data = {
            "query_id": query_result["query_id"],
            "question": query_result.get("original_question", ""),
            "answer": query_result["answer"],
            "language": query_result.get("language", "English"),
            "timestamp": datetime.now().isoformat(),
            "sources": [
                {
                    "source": doc.metadata.get("source", "Unknown"),
                    "university": doc.metadata.get("university", "Unknown"),
                    "country": doc.metadata.get("country", "Unknown"),
                    "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                }
                for doc in query_result.get("source_documents", [])
            ]
        }

        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                json.dump(save_data, f, indent=2, ensure_ascii=False)
            return True
        except Exception as e:
            st.error(f"Error saving query result: {str(e)}")
            return False
    return False

def load_shared_query(query_id: str) -> Optional[Dict[str, Any]]:
    """Load a shared query result."""
    result_file = f"query_results/{query_id}.json"

    if os.path.exists(result_file):
        try:
            with open(result_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            st.error(f"Error loading shared query: {str(e)}")

    return None
requirements.txt
ADDED
@@ -0,0 +1,14 @@
langchain==0.1.0
langchain-community==0.0.10
langchain-openai==0.0.5
streamlit==1.29.0
pypdf2==3.0.1
pycryptodome==3.23.0
chromadb==0.4.22
sentence-transformers==5.1.0
faiss-cpu==1.7.4
python-dotenv==1.0.0
openai==1.6.1
tiktoken==0.5.2
streamlit-extras==0.3.5
watchdog==3.0.0
requirements_clean.txt
ADDED
@@ -0,0 +1,11 @@
langchain
langchain-community
langchain-openai
streamlit
pypdf2
chromadb
sentence-transformers
python-dotenv
openai
tiktoken
streamlit-extras
sample_documents/sample_university_requirements.txt
ADDED
@@ -0,0 +1,177 @@
# Sample University Admission Requirements

## National University of Singapore (NUS) - Computer Science Master's Program

### Program Overview
The Master of Computing (Computer Science) program at NUS is a comprehensive graduate program designed for students seeking advanced knowledge in computer science.

### Admission Requirements

#### Academic Requirements
- Bachelor's degree in Computer Science, Computer Engineering, or related field
- Minimum GPA of 3.5/4.0 or equivalent (Second Class Upper Honours)
- Strong background in mathematics and programming

#### English Proficiency Requirements
For international students whose native language is not English:
- IELTS: Minimum overall score of 6.5 (no band less than 6.0)
- TOEFL iBT: Minimum score of 85 (writing minimum 22)
- PTE Academic: Minimum score of 65

#### Additional Requirements
- Statement of Purpose (500-1000 words)
- Two letters of recommendation from academic or professional referees
- Resume/CV highlighting relevant experience
- Portfolio of programming projects (preferred)

### Tuition Fees (2024-2025 Academic Year)
- Singapore Citizens: S$12,500 per year
- Singapore Permanent Residents: S$17,500 per year
- International Students: S$25,000 per year

### Application Deadlines
- **Priority Round**: November 15, 2024
- **Regular Round**: January 31, 2025
- **Late Round**: March 15, 2025 (subject to availability)

### Application Process
1. Submit online application through NUS Graduate School portal
2. Upload required documents
3. Pay application fee of S$50
4. Submit by deadline
5. Attend interview if shortlisted (February-April)
6. Admission results: April-May

### Program Duration
- Full-time: 1.5 years (3 semesters)
- Part-time: 2.5 years (5 semesters)

### Financial Aid
- NUS Graduate Research Scholarship available for qualifying students
- Teaching assistantships for outstanding applicants
- Industry sponsorship opportunities

### Contact Information
- Email: [email protected]
- Phone: +65 6516 2492
- Website: www.nus.edu.sg/graduateschool

---

## University of Malaya (UM) - Engineering Master's Programs

### Program Overview
The Faculty of Engineering offers various Master's degree programs in engineering disciplines.

### Admission Requirements

#### Academic Requirements
- Bachelor's degree in Engineering or related field with minimum CGPA of 3.0/4.0
- For applicants with CGPA below 3.0, relevant work experience of at least 2 years required

#### English Proficiency Requirements
For international students:
- IELTS: Minimum overall score of 6.0 (no band less than 5.5)
- TOEFL iBT: Minimum score of 80
- MUET (Malaysian University English Test): Band 4 minimum

#### Program-Specific Requirements
- **Civil Engineering**: AutoCAD proficiency preferred
- **Electrical Engineering**: Basic knowledge of circuit analysis
- **Mechanical Engineering**: Thermodynamics and fluid mechanics background

### Tuition Fees (2024 Academic Year)
- Malaysian Citizens: RM 8,000 per year
- International Students: RM 15,000 per year
- ASEAN Students: RM 12,000 per year (special rate)

### Application Deadlines
- **Main Intake (September)**: April 30, 2024
- **Second Intake (February)**: October 31, 2024

### Scholarships Available
- UM Graduate Merit Scholarship (50% tuition fee waiver)
- ASEAN Scholarship Program
- Industry-sponsored scholarships

### Living Costs (Estimated per month)
- Accommodation: RM 500-800
- Food: RM 400-600
- Transportation: RM 100-200
- Other expenses: RM 200-300
- **Total: RM 1,200-1,900 per month**

### Application Requirements
1. Completed application form
2. Academic transcripts
3. Bachelor's degree certificate
4. English proficiency test results
5. Two reference letters
6. Research proposal (for research-based programs)
7. Passport copy
8. Passport-sized photographs

### Contact Information
- Email: [email protected]
- Phone: +603 7967 3026
- Address: Faculty of Engineering, University of Malaya, 50603 Kuala Lumpur, Malaysia

---

## Chulalongkorn University - Business Administration Master's (MBA)

### Program Overview
The Chulalongkorn Business School MBA program is Thailand's premier business education program.

### Admission Requirements

#### Academic Requirements
- Bachelor's degree from accredited institution
- Minimum GPA of 2.75/4.0 or equivalent
- GMAT score of 500+ (preferred) or GRE equivalent
- Minimum 2 years of work experience

#### English Proficiency Requirements
- TOEFL iBT: Minimum score of 79
- IELTS: Minimum overall score of 6.5
- CU-TEP: Minimum score of 80

### Tuition Fees (2024-2025)
- Full-time MBA: 850,000 THB (approx. USD 25,000) total program
- Executive MBA: 1,200,000 THB (approx. USD 35,000) total program

### Application Deadlines
- **Early Admission**: February 15, 2024
- **Regular Admission**: April 30, 2024
- **Final Round**: June 15, 2024

### Program Duration
- Full-time MBA: 16 months
- Executive MBA: 18 months (weekend classes)

### Scholarships
- Merit-based scholarships up to 50% tuition
- Corporate sponsorship opportunities
- Government scholarships for ASEAN students

### Application Process
1. Online application submission
2. Submit required documents
3. GMAT/GRE scores
4. Personal interview
5. Group discussion assessment

### Career Support
- Career counseling services
- Industry networking events
- Internship placement assistance
- Alumni network access

### Contact Information
- Email: [email protected]
- Phone: +66 2 218 6601
- Website: www.cbs.chula.ac.th

---

*This document contains sample admission information for demonstration purposes. Please verify all details with the respective universities before applying.*
start.sh
ADDED
@@ -0,0 +1,43 @@
#!/bin/bash

# PanSea University Search - Startup Script

echo "🎓 Starting PanSea University Search..."

# Check if virtual environment exists
if [ ! -d ".venv" ]; then
    echo "❌ Virtual environment not found. Please run setup first."
    exit 1
fi

# Activate virtual environment
source .venv/bin/activate

# Check if .env file exists
if [ ! -f ".env" ]; then
    echo "⚠️ .env file not found. Please create one with your SEA-LION and OpenAI API keys."
    echo "Example:"
    echo "SEA_LION_API_KEY=your_sea_lion_api_key_here"
    echo "OPENAI_API_KEY=your_openai_api_key_here"
    exit 1
fi

# Create necessary directories
mkdir -p chroma_db
mkdir -p documents
mkdir -p query_results

# Check if required packages are installed
echo "🔍 Checking dependencies..."
python -c "import streamlit, langchain, chromadb" 2>/dev/null
if [ $? -ne 0 ]; then
    echo "❌ Dependencies not found. Installing..."
    pip install -r requirements.txt
fi

echo "🚀 Starting Streamlit application..."
echo "📱 Open your browser to: http://localhost:8501"
echo "🛑 Press Ctrl+C to stop the application"
echo ""

# Start the Streamlit app
streamlit run app.py --server.port=8501 --server.address=0.0.0.0
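For reference, the .env file that start.sh checks for lives in the project root. The key names below come directly from the checks and example messages in start.sh and test_system.py; the values are placeholders to replace with real keys:

SEA_LION_API_KEY=your_sea_lion_api_key_here
OPENAI_API_KEY=your_openai_api_key_here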
test_system.py
ADDED
@@ -0,0 +1,192 @@
#!/usr/bin/env python3
"""
Test script for PanSea University Search System
"""

import os
import sys
from pathlib import Path

# Add the project directory to Python path
project_dir = Path(__file__).parent
sys.path.insert(0, str(project_dir))

def test_imports():
    """Test if all required modules can be imported."""
    print("🧪 Testing imports...")

    try:
        import streamlit
        print("✅ Streamlit imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import Streamlit: {e}")
        return False

    try:
        import langchain
        print("✅ LangChain imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import LangChain: {e}")
        return False

    try:
        import chromadb
        print("✅ ChromaDB imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import ChromaDB: {e}")
        return False

    try:
        from rag_system import DocumentIngestion, RAGSystem
        print("✅ RAG system modules imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import RAG system: {e}")
        return False

    return True

def test_environment():
    """Test environment configuration."""
    print("\n🔧 Testing environment...")

    # Check if .env file exists
    env_file = project_dir / ".env"
    if not env_file.exists():
        print("⚠️ .env file not found. You'll need to create one with your SEA-LION and OpenAI API keys.")
        return False

    # Load environment variables
    try:
        from dotenv import load_dotenv
        load_dotenv()

        # Check SEA-LION API key
        sea_lion_key = os.getenv("SEA_LION_API_KEY")
        if not sea_lion_key:
            print("⚠️ SEA_LION_API_KEY not found in .env file")
            return False
        elif sea_lion_key == "your_sea_lion_api_key_here":
            print("⚠️ Please update SEA_LION_API_KEY in .env file with your actual API key")
            return False
        else:
            print(f"✅ SEA-LION API key configured (length: {len(sea_lion_key)})")

        # Check OpenAI API key (for embeddings)
        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key or openai_key == "your_openai_api_key_here":
            print("⚠️ OpenAI API key not properly configured. This is needed for document embeddings.")
            return False
        else:
            print(f"✅ OpenAI API key configured (length: {len(openai_key)})")

        return True

    except ImportError as e:
        print(f"❌ Failed to load environment: {e}")
        return False

def test_directories():
    """Test if required directories exist or can be created."""
    print("\n📁 Testing directories...")

    required_dirs = ["chroma_db", "documents", "query_results"]

    for dir_name in required_dirs:
        dir_path = project_dir / dir_name
        try:
            dir_path.mkdir(exist_ok=True)
            print(f"✅ Directory '{dir_name}' ready")
        except Exception as e:
            print(f"❌ Failed to create directory '{dir_name}': {e}")
            return False

    return True

def test_basic_functionality():
    """Test basic RAG system functionality."""
    print("\n⚡ Testing basic functionality...")

    try:
        from rag_system import DocumentIngestion, SEALionLLM

        # Test document ingestion initialization
        doc_ingestion = DocumentIngestion()
        print("✅ DocumentIngestion initialized successfully")

        # Test SEA-LION LLM
        sea_lion = SEALionLLM()
        print("✅ SEALionLLM initialized successfully")

        # Test query classification
        complex_query = "Show me universities in Malaysia for master's degree under 40000 RMB"
        simple_query = "What does IELTS stand for?"

        is_complex1 = sea_lion._is_complex_query(complex_query)
        is_complex2 = sea_lion._is_complex_query(simple_query)

        if is_complex1 and not is_complex2:
            print("✅ Query classification working correctly")
        else:
            print(f"⚠️ Query classification may need adjustment (complex: {is_complex1}, simple: {is_complex2})")

        # Test text splitter
        text = "This is a test document. It contains multiple sentences. Each sentence should be processed correctly."
        docs = doc_ingestion.text_splitter.split_text(text)
        if docs:
            print(f"✅ Text splitter working (created {len(docs)} chunks)")
        else:
            print("❌ Text splitter not working")
            return False

        return True

    except Exception as e:
        print(f"❌ Basic functionality test failed: {e}")
        return False

def main():
    """Run all tests."""
    print("🎓 PanSea University Search - System Test")
    print("=" * 50)

    tests = [
        ("Import Test", test_imports),
        ("Environment Test", test_environment),
        ("Directory Test", test_directories),
        ("Functionality Test", test_basic_functionality)
    ]

    all_passed = True

    for test_name, test_func in tests:
        try:
            result = test_func()
            if not result:
                all_passed = False
        except Exception as e:
            print(f"❌ {test_name} failed with exception: {e}")
            all_passed = False

    print("\n" + "=" * 50)
    if all_passed:
        print("✅ All tests passed! Your system is ready to use.")
        print("\nTo start the application, run:")
        print(" ./start.sh")
        print("\nOr manually with:")
        print(" streamlit run app.py")
    else:
        print("❌ Some tests failed. Please fix the issues above before running the application.")
        print("\nCommon solutions:")
        print("1. Make sure you've activated the virtual environment:")
        print(" source .venv/bin/activate")
        print("2. Install dependencies:")
        print(" pip install -r requirements.txt")
        print("3. Create .env file with your API keys:")
        print(" SEA_LION_API_KEY=your_sea_lion_api_key")
        print(" OPENAI_API_KEY=your_openai_api_key")

    return all_passed

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
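The functionality test above calls SEALionLLM._is_complex_query, which is defined in rag_system.py and is not shown in this excerpt. Purely as an illustration of the kind of keyword-plus-constraint heuristic that would separate the two sample queries, a minimal standalone sketch might look like the following (the function name, keyword list, and regex here are assumptions, not the project's actual implementation):

# Hypothetical sketch only -- rag_system.py (and the real SEALionLLM._is_complex_query)
# is not part of this excerpt. It shows one way a keyword-plus-constraint heuristic
# could distinguish the two queries used in test_basic_functionality().
import re

# Phrases that suggest a search-style request rather than a simple definition lookup.
COMPLEX_HINTS = ["show me", "list", "recommend", "compare", "under", "below", "cheaper"]

# Budgets or currencies mentioned in the query (e.g. "40000 RMB") count as constraints.
CONSTRAINT_PATTERN = re.compile(r"\b(rm|rmb|thb|usd|sgd)\b|\d{3,}", re.IGNORECASE)


def is_complex_query(query: str) -> bool:
    """Return True when the query combines search phrasing with a concrete constraint."""
    q = query.lower()
    has_hint = any(hint in q for hint in COMPLEX_HINTS)
    has_constraint = bool(CONSTRAINT_PATTERN.search(q))
    return has_hint and has_constraint


if __name__ == "__main__":
    # Mirrors the two test queries: expected True, then False.
    print(is_complex_query("Show me universities in Malaysia for master's degree under 40000 RMB"))
    print(is_complex_query("What does IELTS stand for?"))

Running `python test_system.py` prints the result of each check and exits with status 0 only when all four tests pass, so the script can double as a quick pre-deployment smoke test.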