Ervinoreo committed on
Commit
e9e516f
·
1 Parent(s): dab98a9
.gitignore ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ pip-wheel-metadata/
20
+ share/python-wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+ MANIFEST
25
+
26
+ # Virtual Environment
27
+ .venv/
28
+ .env/
29
+ venv/
30
+ ENV/
31
+ env/
32
+ .venv
33
+
34
+ # Environment Variables
35
+ .env
36
+ .env.local
37
+ .env.development.local
38
+ .env.test.local
39
+ .env.production.local
40
+
41
+ # IDE
42
+ .vscode/
43
+ .idea/
44
+ *.swp
45
+ *.swo
46
+ *~
47
+
48
+ # macOS
49
+ .DS_Store
50
+ .AppleDouble
51
+ .LSOverride
52
+
53
+ # Windows
54
+ Thumbs.db
55
+ ehthumbs.db
56
+ Desktop.ini
57
+
58
+ # Jupyter Notebooks
59
+ .ipynb_checkpoints
60
+
61
+ # AI/ML specific
62
+ chroma_db/
63
+ chromadb/
64
+ *.db
65
+ *.sqlite
66
+ *.sqlite3
67
+
68
+ # Document storage
69
+ documents/
70
+ uploaded_documents/
71
+ temp_documents/
72
+
73
+ # Query results and cache
74
+ query_results/
75
+ .cache/
76
+ .streamlit/
77
+
78
+ # Model downloads and cache
79
+ models/
80
+ .transformers_cache/
81
+ .huggingface/
82
+ sentence_transformers_cache/
83
+
84
+ # Logs
85
+ *.log
86
+ logs/
87
+ .logs/
88
+
89
+ # Temporary files
90
+ tmp/
91
+ temp/
92
+ .tmp/
93
+
94
+ # Coverage reports
95
+ htmlcov/
96
+ .tox/
97
+ .coverage
98
+ .coverage.*
99
+ .cache
100
+ nosetests.xml
101
+ coverage.xml
102
+ *.cover
103
+ .hypothesis/
104
+ .pytest_cache/
105
+
106
+ # mypy
107
+ .mypy_cache/
108
+ .dmypy.json
109
+ dmypy.json
app.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from urllib.parse import urlparse, parse_qs
4
+ from rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
5
+ from datetime import datetime
6
+ import uuid
7
+
8
# Configure the Streamlit page (title, icon, wide layout, sidebar open).
st.set_page_config(
    page_title="PanSea University Search",
    page_icon="🎓",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS - dark-theme compatible. The classes .main-header, .query-result,
# .source-doc, .share-link, .model-info and .language-selection are used by
# the HTML snippets this app renders; the .st* rules restyle built-in widgets.
st.markdown("""
<style>
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        margin: -1rem -1rem 2rem -1rem;
        border-radius: 10px;
        box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
    }

    .stApp {
        background: var(--background-color);
    }

    /* Dark theme compatible containers */
    .query-result {
        background: rgba(255, 255, 255, 0.05);
        backdrop-filter: blur(10px);
        border: 1px solid rgba(255, 255, 255, 0.1);
        padding: 1.5rem;
        border-radius: 15px;
        margin: 1rem 0;
        color: var(--text-color);
    }

    .source-doc {
        background: rgba(31, 119, 180, 0.1);
        backdrop-filter: blur(5px);
        padding: 1rem;
        border-left: 4px solid #1f77b4;
        border-radius: 8px;
        margin: 0.5rem 0;
        color: var(--text-color);
    }

    .share-link {
        background: rgba(46, 204, 113, 0.1);
        backdrop-filter: blur(5px);
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #2ecc71;
        color: var(--text-color);
    }

    /* Model indicator boxes */
    .model-info {
        background: rgba(52, 152, 219, 0.15);
        backdrop-filter: blur(10px);
        padding: 15px;
        border-radius: 12px;
        border-left: 4px solid #3498db;
        margin: 10px 0;
    }

    /* Language selection enhancement */
    .language-selection {
        background: rgba(155, 89, 182, 0.1);
        backdrop-filter: blur(10px);
        padding: 15px;
        border-radius: 12px;
        border-left: 4px solid #9b59b6;
        margin: 10px 0;
    }

    /* Upload area enhancement */
    .stFileUploader {
        background: rgba(230, 126, 34, 0.1);
        backdrop-filter: blur(10px);
        padding: 20px;
        border-radius: 15px;
        border: 2px dashed #e67e22;
    }

    .stFileUploader label {
        font-size: 1.2rem;
        font-weight: bold;
        color: var(--text-color);
    }

    /* Button enhancements */
    .stButton > button {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        border-radius: 10px;
        padding: 0.6rem 1.5rem;
        font-weight: 600;
        transition: all 0.3s ease;
        box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
    }

    .stButton > button:hover {
        transform: translateY(-2px);
        box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
    }

    /* Sidebar enhancements.
       .css-1d391kg is an auto-generated class name that differs between
       Streamlit releases; the data-testid selector is the stable hook.
       The old class is kept so existing deployments render unchanged. */
    [data-testid="stSidebar"],
    .css-1d391kg {
        background: rgba(255, 255, 255, 0.02);
        backdrop-filter: blur(10px);
    }

    /* Info boxes */
    .stInfo {
        background: rgba(52, 152, 219, 0.1);
        backdrop-filter: blur(10px);
        border-left: 4px solid #3498db;
    }

    .stSuccess {
        background: rgba(46, 204, 113, 0.1);
        backdrop-filter: blur(10px);
        border-left: 4px solid #2ecc71;
    }

    .stWarning {
        background: rgba(241, 196, 15, 0.1);
        backdrop-filter: blur(10px);
        border-left: 4px solid #f1c40f;
    }

    .stError {
        background: rgba(231, 76, 60, 0.1);
        backdrop-filter: blur(10px);
        border-left: 4px solid #e74c3c;
    }
</style>
""", unsafe_allow_html=True)
147
+
148
def main():
    """Route the request: shared-result view, or the page picked in the sidebar.

    If the URL carries a ``share`` query parameter, only the shared result is
    shown. Otherwise the header and sidebar are rendered and one of the
    upload / about / search pages is dispatched.
    """
    # st.query_params.get() yields the value as a string, so indexing it with
    # [0] would keep only the first character of the id. A list/tuple is still
    # unwrapped defensively in case a list-valued mapping is ever supplied.
    shared_query_id = st.query_params.get("share")
    if isinstance(shared_query_id, (list, tuple)):
        shared_query_id = shared_query_id[0] if shared_query_id else None

    if shared_query_id:
        display_shared_query(shared_query_id)
        return

    # Main header
    st.markdown("""
        <div class="main-header">
        <h1>🎓 PanSea University Search</h1>
        <p>AI-Powered Study Search Platform for ASEAN Universities</p>
        </div>
    """, unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.header("📋 Navigation")
        page = st.selectbox(
            "Choose a page:",
            ["🔍 Search Universities", "📄 Upload Documents", "ℹ️ About"],
        )

        # Show which embedding backend the RAG system ended up with.
        st.markdown("---")
        try:
            probe = RAGSystem()
            uses_local_model = bool(getattr(probe.embeddings, "model", None))
        except Exception:
            # Best-effort widget: a failing RAG setup must not break navigation.
            # `except Exception` (not a bare except) keeps Streamlit's
            # BaseException-derived stop/rerun signals propagating.
            model_label = None
        else:
            model_label = "BGE-small-en-v1.5" if uses_local_model else "OpenAI Ada-002"

        if model_label is not None:
            st.markdown(f"""
                <div class='language-selection'>
                <h5 style='margin: 0; color: #9b59b6;'>🔧 Embedding Model</h5>
                <p style='margin: 5px 0; font-size: 0.9em;'>{model_label}</p>
                </div>
            """, unsafe_allow_html=True)

    # Main content based on selected page
    if page == "📄 Upload Documents":
        upload_documents_page()
    elif page == "ℹ️ About":
        about_page()
    else:
        search_page()
202
+
203
def upload_documents_page():
    """Render the upload form and ingest the submitted PDFs.

    Collects university metadata and PDF files, then (once the process button
    is pressed) hands them to ``DocumentIngestion`` and reports the outcome.
    """
    st.header("📄 Upload University Documents")
    st.write("Upload PDF documents containing university admission requirements, fees, and program information.")

    left, right = st.columns(2)

    with left:
        university_name = st.text_input("🏫 University Name", placeholder="e.g., National University of Singapore")
        country = st.selectbox(
            "🌏 Country",
            ["", "Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei", "Cambodia", "Laos", "Myanmar"]
        )

    with right:
        document_type = st.selectbox(
            "📋 Document Type",
            ["admission_requirements", "tuition_fees", "program_catalog", "application_guide", "scholarship_info"]
        )
        # NOTE(review): the selected language is collected but not passed on
        # to the ingestion step below.
        language = st.selectbox(
            "🌐 Primary Language",
            ["English", "Chinese", "Malay", "Thai", "Indonesian", "Vietnamese", "Filipino", "Other"]
        )

    # File upload
    uploaded_files = st.file_uploader(
        "Choose PDF files",
        accept_multiple_files=True,
        type=['pdf'],
        help="Select one or more PDF files to upload"
    )

    # The process button is only rendered once at least one file is present.
    if not (uploaded_files and st.button("🚀 Process Documents", type="primary")):
        return

    if not (university_name and country):
        st.error("Please provide university name and country.")
        return

    with st.spinner("Processing documents... This may take a few minutes."):
        try:
            ingestion = DocumentIngestion()
            processed = ingestion.process_documents(
                uploaded_files, university_name, country, document_type
            )

            if not processed:
                st.error("No documents were successfully processed.")
            elif ingestion.create_vector_store(processed):
                st.success(f"✅ Successfully processed {len(processed)} documents!")
                st.info(f"Documents from {university_name} ({country}) have been added to the knowledge base.")

                # Show processed files
                with st.expander("📋 Processed Files"):
                    for item in processed:
                        meta = item.metadata
                        st.write(f"• **{meta['source']}**")
                        st.write(f" - University: {meta['university']}")
                        st.write(f" - Country: {meta['country']}")
                        st.write(f" - Type: {meta['document_type']}")
                        st.write("---")

        except Exception as e:
            st.error(f"Error processing documents: {e}")
270
+
271
def search_page():
    """Render the search page: language choice, examples, filters and results.

    The question box is a keyed widget. Example buttons fill it through an
    ``on_click`` callback, so the chosen example is still in the box on the
    rerun triggered by the Search button. (Previously the example lived in
    session state for a single run only, so pressing Search afterwards found
    an empty question and never searched.)
    """
    query_key = "search_query_input"

    def _use_example(text):
        # Callbacks run before the script reruns, which is when Streamlit
        # allows a keyed widget's value to be set programmatically.
        st.session_state[query_key] = text

    st.header("🔍 Search University Information")

    # Display label -> language name passed to the RAG system.
    language_map = {
        "English": "English",
        "中文 (Chinese)": "Chinese",
        "Bahasa Malaysia": "Malay",
        "ไทย (Thai)": "Thai",
        "Bahasa Indonesia": "Indonesian",
        "Tiếng Việt (Vietnamese)": "Vietnamese",
    }

    # Language selection
    intro_col, lang_col = st.columns([3, 1])
    with intro_col:
        st.write("Ask questions about university admissions, requirements, fees, and programs:")
    with lang_col:
        response_language = st.selectbox(
            "Response Language",
            list(language_map),
            key="response_language",
        )
    language_code = language_map.get(response_language, "English")

    if language_code != "English":
        st.info(f"🌐 AI will respond in **{language_code}** based on your selection")

    # Example queries with model indicators
    st.markdown("**💡 Example queries:**")
    st.markdown("""
        <div class='model-info'>
        <h4 style='margin: 0; color: #3498db;'>🤖 AI Model Selection</h4>
        <p style='margin: 5px 0;'><strong>🧠 Reasoning Model (SEA-LION v3.5):</strong> Complex university searches with multiple criteria, comparisons, budget constraints</p>
        <p style='margin: 5px 0;'><strong>⚡ Instruct Model (SEA-LION v3):</strong> Simple questions, translations, definitions, basic information</p>
        <p style='margin: 5px 0; font-style: italic;'>The system automatically chooses the best model for your query!</p>
        </div>
    """, unsafe_allow_html=True)

    complex_examples = [
        "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
        "专科毕业,无雅思,想在马来西亚读硕士,学费不超过4万人民币/年",
        "Compare engineering programs in Thailand and Singapore under $15,000 per year",
        "Find MBA programs in ASEAN with GMAT requirements and scholarships available",
    ]
    simple_examples = [
        "What does IELTS stand for?",
        "Translate 'application deadline' to Chinese",
        "What is the difference between bachelor and master degree?",
        "How to say 'university' in Thai?",
    ]

    complex_col, simple_col = st.columns(2)
    with complex_col:
        st.markdown("**🧠 Complex Queries (Uses Reasoning Model):**")
        for example in complex_examples:
            st.markdown(f"• {example}")
    with simple_col:
        st.markdown("**⚡ Simple Queries (Uses Instruct Model):**")
        for example in simple_examples:
            st.markdown(f"• {example}")

    st.markdown("---")  # Separator line

    # Query input - main input field (always available)
    query = st.text_area(
        "Your question:",
        height=100,
        placeholder="e.g., What are the admission requirements for computer science programs in Singapore?",
        help="Type your question here or select an example below to get started.",
        key=query_key,
    )

    # Show search status
    if query.strip():
        st.success("✅ Ready to search! Click the search button when you're ready.")
    else:
        st.info("💭 Enter your question in the text box above to start searching.")

    # Optional: quick example selection (just for convenience)
    with st.expander("💡 Example Queries (Click to Use)"):
        complex_col, simple_col = st.columns(2)
        example_groups = (
            (complex_col, "**🧠 Complex Examples:**", "complex", complex_examples),
            (simple_col, "**⚡ Simple Examples:**", "simple", simple_examples),
        )
        for column, title, tag, examples in example_groups:
            with column:
                st.markdown(title)
                for index, example in enumerate(examples):
                    # Only add an ellipsis when the label is actually cut.
                    label = example if len(example) <= 60 else example[:60] + "..."
                    st.button(
                        label,
                        key=f"ex_{tag}_{index}",
                        help=f"Click to use: {example}",
                        on_click=_use_example,
                        args=(example,),
                    )

    # Additional filters
    with st.expander("🔧 Advanced Filters (Optional)"):
        budget_col, level_col, country_col = st.columns(3)
        with budget_col:
            budget_range = st.select_slider(
                "Budget Range (USD/year)",
                options=["Any", "<10k", "10k-20k", "20k-30k", "30k-40k", ">40k"],
                value="Any",
            )
        with level_col:
            study_level = st.multiselect(
                "Study Level",
                ["Diploma", "Bachelor", "Master", "PhD"],
                default=[],
            )
        with country_col:
            preferred_countries = st.multiselect(
                "Preferred Countries",
                ["Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei"],
                default=[],
            )

    # Search button - enabled as soon as there's text in the query. The
    # explicit key keeps the widget's identity stable although its label
    # changes with the enabled state.
    search_disabled = not query.strip()
    button_text = "🔍 Search" if not search_disabled else "🔍 Search (Enter a question first)"
    if not st.button(button_text, type="primary", disabled=search_disabled, key="search_button"):
        return

    with st.spinner("Searching for information..."):
        try:
            # Initialize RAG system
            rag = RAGSystem()

            # Show which model will be used
            from rag_system import classify_query_type
            if classify_query_type(query) == "complex":
                st.info("🧠 **Using SEA-LION Reasoning Model (v3.5)** - Complex query detected")
            else:
                st.info("⚡ **Using SEA-LION Instruct Model (v3)** - Simple query/translation detected")

            # Show translation status if not English
            if language_code != "English":
                st.info(f"🌐 **Translating response to {response_language}**")

            # Append the chosen filters to the query text.
            filters = []
            if budget_range != "Any":
                filters.append(f"budget range: {budget_range}")
            if study_level:
                filters.append(f"study levels: {', '.join(study_level)}")
            if preferred_countries:
                filters.append(f"countries: {', '.join(preferred_countries)}")

            enhanced_query = query
            if filters:
                enhanced_query += f"\n\nAdditional filters: {'; '.join(filters)}"

            # Get response
            result = rag.query(enhanced_query, language_code)

            if result:
                # Save query result for sharing
                save_query_result(result)
                display_query_result(result, show_share_link=True)
            else:
                st.error("No results found. Try rephrasing your question or upload more documents.")

        except Exception as e:
            st.error(f"Error searching: {str(e)}")
464
+
465
def _build_share_url(query_id):
    """Return the ``?share=<id>`` URL for this app from the browser config."""
    from urllib.parse import quote

    host = st.get_option("browser.serverAddress") or "localhost"
    port = st.get_option("browser.serverPort")
    # NOTE(review): assumes the app is reached over plain http at the
    # configured browser address/port - confirm for proxied deployments.
    netloc = f"{host}:{port}" if port and ":" not in str(host) else host
    return f"http://{netloc}/?share={quote(str(query_id), safe='')}"


def display_query_result(result, show_share_link=False):
    """Display a query result: model used, answer, share link and sources.

    Args:
        result: Mapping with an ``"answer"`` entry and optional
            ``"model_used"``, ``"query_id"`` and ``"source_documents"``
            entries. Each source document must expose ``metadata`` (a
            mapping) and ``page_content`` (a string).
        show_share_link: When true and the result has a ``query_id``, a
            shareable URL is shown.
    """
    st.markdown('<div class="query-result">', unsafe_allow_html=True)

    # Show which model was used
    if result.get("model_used"):
        st.info(f"🤖 **Model Used:** {result['model_used']}")

    st.subheader("🎯 Answer")
    st.write(result["answer"])

    # Share link
    if show_share_link and result.get("query_id"):
        st.markdown("---")
        share_url = _build_share_url(result["query_id"])
        st.markdown("""
            <div class="share-link">
            <strong>🔗 Share this result:</strong>
            </div>
        """, unsafe_allow_html=True)
        # Rendered directly instead of behind a button: pressing a button
        # reruns the script, and the caller only renders results in the run
        # in which its own search button was pressed.
        st.code(share_url)

    # Source documents
    sources = result.get("source_documents")
    if sources:
        st.markdown("---")
        st.subheader("📚 Sources")
        for position, doc in enumerate(sources, 1):
            with st.expander(f"Source {position}: {doc.metadata.get('source', 'Unknown')}"):
                meta_col, content_col = st.columns([1, 2])
                with meta_col:
                    st.write(f"**University:** {doc.metadata.get('university', 'Unknown')}")
                    st.write(f"**Country:** {doc.metadata.get('country', 'Unknown')}")
                    st.write(f"**Type:** {doc.metadata.get('document_type', 'Unknown')}")
                with content_col:
                    st.write("**Relevant Content:**")
                    text = doc.page_content
                    # Preview is capped at 300 characters.
                    st.write(text[:300] + "..." if len(text) > 300 else text)

    st.markdown('</div>', unsafe_allow_html=True)
508
+
509
def display_shared_query(query_id):
    """Display a previously saved query result identified by ``query_id``.

    Loads the stored record with ``load_shared_query``. If found, shows the
    question, language, date and the answer with its sources; otherwise shows
    an error. In both cases a button clears the ``share`` parameter and
    returns to the normal app.
    """
    from types import SimpleNamespace

    st.header("🔗 Shared Query Result")

    result_data = load_shared_query(query_id)

    if result_data:
        st.info(f"**Original Question:** {result_data['question']}")
        st.write(f"**Language:** {result_data['language']}")
        # The first 10 characters of the stored timestamp are shown as the date.
        st.write(f"**Date:** {result_data['timestamp'][:10]}")

        # Stored sources are plain mappings; wrap them so they expose the
        # .metadata / .page_content attributes display_query_result reads.
        stored_result = {
            "answer": result_data["answer"],
            "source_documents": [
                SimpleNamespace(
                    metadata=source,
                    page_content=source.get('content_preview', ''),
                )
                for source in result_data.get('sources', [])
            ],
        }
        display_query_result(stored_result, show_share_link=False)
        home_label = "🔍 Ask Your Own Question"
    else:
        st.error("❌ Shared query not found or has expired.")
        home_label = "🏠 Go to Home"

    if st.button(home_label):
        # The app reads st.query_params elsewhere; the experimental_* query
        # param helpers cannot be combined with it, so use the same API here.
        st.query_params.clear()
        st.rerun()
541
+
542
def about_page():
    """Render the static "About" page in a wide and a narrow column."""
    st.header("ℹ️ About PanSea University Search")

    # Left column: problem, solution, models, languages, workflow.
    overview_md = """
        ### 🎯 Problem We Solve

        Prospective students worldwide seeking to study abroad face difficulty finding accurate, up-to-date university admission requirements. Information is scattered across PDFs, brochures, and outdated agency websites. Many waste time applying to unsuitable programs due to missing criteria and pay high agent fees.

        ### 💡 Our Solution

        PanSea is an LLM-powered, RAG-based study search platform powered by **SEA-LION models** that ingests official admissions documents from ASEAN universities. Students can query in any ASEAN language and receive:

        - 📋 **Ranked program matches** with detailed requirements
        - 💰 **Tuition fees and costs**
        - 📅 **Application deadlines and windows**
        - 🎓 **Entry requirements and prerequisites**
        - 📖 **Source citations** from official documents

        ### 🤖 AI Models Used

        - **SEA-LION v3.5 Reasoning Model**: For complex university search queries requiring multi-step reasoning
        - **SEA-LION v3 Instruct Model**: For translation and simple question-answering
        - **Automatic Model Selection**: The system intelligently chooses the appropriate model based on query complexity

        ### 🌏 Supported Languages

        - English
        - 中文 (Chinese)
        - Bahasa Malaysia
        - ไทย (Thai)
        - Bahasa Indonesia
        - Tiếng Việt (Vietnamese)
        - Filipino

        ### 🔧 How It Works

        1. **📄 Document Ingestion**: Upload official PDF documents from universities
        2. **🔍 AI Processing**: Our system processes and indexes the content
        3. **❓ Natural Language Queries**: Ask questions in your preferred language
        4. **🎯 Intelligent Answers**: Get relevant, sourced responses
        5. **🔗 Share Results**: Generate shareable links for your queries
        """

    # Right column: feature list, target countries, quick start.
    highlights_md = """
        ### 📊 Features

        ✅ **Multi-language support**
        ✅ **PDF document ingestion**
        ✅ **Intelligent search & retrieval**
        ✅ **Source citations**
        ✅ **Shareable query results**
        ✅ **Advanced filtering**
        ✅ **Real-time processing**

        ### 🏛️ Target Universities

        - 🇸🇬 Singapore
        - 🇲🇾 Malaysia
        - 🇹🇭 Thailand
        - 🇮🇩 Indonesia
        - 🇵🇭 Philippines
        - 🇻🇳 Vietnam
        - 🇧🇳 Brunei
        - 🇰🇭 Cambodia
        - 🇱🇦 Laos
        - 🇲🇲 Myanmar

        ### 🚀 Get Started

        1. Go to **Upload Documents** to add university PDFs
        2. Use **Search Universities** to ask questions
        3. Share your results with others!
        """

    wide_col, narrow_col = st.columns([2, 1])
    with wide_col:
        st.markdown(overview_md)
    with narrow_col:
        st.markdown(highlights_md)
619
+
620
if __name__ == "__main__":
    # The SEA-LION key is mandatory: without it the script stops here.
    sea_lion_key = os.getenv("SEA_LION_API_KEY")
    if not sea_lion_key:
        st.error("🚨 SEA-LION API Key not found! Please set your SEA_LION_API_KEY in the .env file.")
        st.code("SEA_LION_API_KEY=your_api_key_here")
        st.stop()

    # The OpenAI key is only warned about; a missing key or the template
    # placeholder value both count as "not configured".
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key or openai_key == "your_openai_api_key_here":
        st.warning("⚠️ OpenAI API Key not configured properly. You'll need it for document embeddings.")
        st.info("The system will use SEA-LION models for text generation, but OpenAI for document embeddings.")

    main()
rag_system.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import tempfile
4
+ from typing import List, Optional, Dict, Any
5
+ import streamlit as st
6
+ from pathlib import Path
7
+ import PyPDF2
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
10
+ from langchain_community.vectorstores import Chroma
11
+ from langchain.chains import RetrievalQA
12
+ from langchain_community.document_loaders import PyPDFLoader
13
+ from langchain.schema import Document
14
+ from dotenv import load_dotenv
15
+ import chromadb
16
+ from datetime import datetime
17
+ import json
18
+ import base64
19
+ from openai import OpenAI
20
+ import re
21
+
22
+ # Load environment variables
23
+ load_dotenv()
24
+
25
class AlternativeEmbeddings:
    """Embeddings backed by a local Sentence Transformers model.

    Exposes ``embed_documents`` / ``embed_query`` for use as an embedding
    function. If ``sentence-transformers`` cannot be imported, ``model`` is
    ``None`` and both methods return an empty list.
    """

    def __init__(self):
        # Always define both attributes so callers can inspect them even
        # when the optional dependency is missing.
        self.model = None
        self.embedding_size = None
        try:
            from sentence_transformers import SentenceTransformer
            # Use BGE-small-en for better performance
            self.model = SentenceTransformer('BAAI/bge-small-en-v1.5')
            self.embedding_size = 384
        except ImportError:
            st.error("sentence-transformers not available. Please install it or provide OpenAI API key.")

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text; ``[]`` without a model."""
        # Compare against None explicitly: the truthiness of a model object
        # may depend on its __len__/__bool__ rather than on its presence.
        if self.model is None:
            return []
        texts = list(texts)
        if not texts:
            return []
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Return the embedding of a single text; ``[]`` without a model."""
        if self.model is None:
            return []
        return self.model.encode([text])[0].tolist()
47
+
48
class SEALionLLM:
    """Client wrapper that routes a query to a SEA-LION instruct or reasoning model.

    Routing is heuristic: translation requests and queries scoring below 2 on
    the keyword/pattern count go to the instruct model, everything else to
    the reasoning model.
    """

    # Substrings that hint at a university-search query. The list is
    # deduplicated ("program" occurs in the English, Malay and Indonesian
    # groups) so that one word cannot count as several criteria.
    _COMPLEX_KEYWORDS = tuple(dict.fromkeys([
        "university", "admission", "requirement", "tuition", "fee", "program", "course",
        "degree", "master", "bachelor", "phd", "scholarship", "deadline", "application",
        "budget", "under", "less than", "below", "compare", "recommend", "suggest",
        "which", "what are the", "show me", "find me", "search for",
        # Chinese keywords
        "大学", "学费", "专业", "硕士", "学士", "博士", "申请", "要求", "奖学金",
        # Malay keywords
        "universiti", "yuran", "program", "ijazah", "syarat", "permohonan",
        # Thai keywords
        "มหาวิทยาลัย", "ค่าเล่าเรียน", "หลักสูตร", "ปริญญา", "เงื่อนไข",
        # Indonesian keywords
        "universitas", "biaya", "kuliah", "program", "sarjana", "persyaratan",
    ]))

    # Comparison / budget phrasings; each match counts double.
    _COMPARISON_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'under \$?\d+', r'less than \$?\d+', r'below \$?\d+', r'between \$?\d+ and \$?\d+',
        r'不超过.*元', r'低于.*元', r'少于.*元',  # Chinese
        r'kurang dari', r'di bawah',  # Malay/Indonesian
        r'น้อยกว่า', r'ต่ำกว่า',  # Thai
    ))

    _TRANSLATION_KEYWORDS = (
        "translate", "translation", "แปล", "翻译", "terjemah", "traduire",
    )

    def __init__(self):
        # SEA-LION is called through the OpenAI-compatible client, pointed at
        # SEA_LION_BASE_URL (or the default endpoint below).
        self.client = OpenAI(
            api_key=os.getenv("SEA_LION_API_KEY"),
            base_url=os.getenv("SEA_LION_BASE_URL", "https://api.sea-lion.ai/v1")
        )

        # Model configurations
        self.instruct_model = "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        self.reasoning_model = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

    def _is_complex_query(self, query: str) -> bool:
        """Return True when the query looks like a multi-criteria search.

        Each distinct keyword found adds 1 and each comparison pattern adds
        2; a total of 2 or more counts as complex. Matching is by substring
        on the lower-cased query.
        """
        query_lower = query.lower()
        score = sum(1 for keyword in self._COMPLEX_KEYWORDS if keyword in query_lower)
        score += 2 * sum(
            1 for pattern in self._COMPARISON_PATTERNS if pattern.search(query_lower)
        )
        return score >= 2

    def _is_translation_query(self, query: str) -> bool:
        """Return True when the query contains a translation keyword."""
        query_lower = query.lower()
        return any(keyword in query_lower for keyword in self._TRANSLATION_KEYWORDS)

    def _wants_reasoning(self, query: str) -> bool:
        """Return True when the reasoning model should answer the query."""
        return not self._is_translation_query(query) and self._is_complex_query(query)

    def generate_response(self, query: str, context: str = "", language: str = "English") -> str:
        """Generate an answer with the SEA-LION model suited to the query.

        Args:
            query: The user's question.
            context: Retrieved document text injected into the system prompt.
            language: Language the model is instructed to answer in.

        Returns:
            The model's reply text. On any API error the error is shown via
            ``st.error`` and an apology string containing the error message
            is returned instead of raising.
        """
        use_reasoning = self._wants_reasoning(query)
        model = self.reasoning_model if use_reasoning else self.instruct_model

        # Prepare messages
        system_prompt = f"""You are a helpful assistant specializing in ASEAN university admissions.
        Respond in {language} unless specifically asked otherwise.

        If provided with context from university documents, use that information to give accurate, specific answers.
        Always cite your sources when using provided context.

        For complex university search queries, provide:
        1. Direct answers to the question
        2. Relevant admission requirements
        3. Tuition fees (if available)
        4. Application deadlines (if available)
        5. Source citations from the documents

        Context: {context}"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]

        # The reasoning model gets a larger budget, lower temperature and the
        # thinking-mode flag; the instruct model uses the lighter settings.
        if use_reasoning:
            request_options = {
                "max_tokens": 2000,
                "temperature": 0.1,
                "extra_body": {"thinking_mode": True},
            }
        else:
            request_options = {"max_tokens": 1500, "temperature": 0.3}

        try:
            response = self.client.chat.completions.create(
                model=model,
                messages=messages,
                **request_options,
            )
            # The content field may be None; keep the declared str return.
            return response.choices[0].message.content or ""

        except Exception as e:
            st.error(f"Error with SEA-LION model: {str(e)}")
            # Fallback to a simple response
            return f"I apologize, but I encountered an error processing your query. Please try rephrasing your question. Error: {str(e)}"
168
+
169
def classify_query_type(query: str) -> str:
    """Classify a query as ``"simple"`` or ``"complex"`` for UI display.

    Translation requests and queries the complexity heuristic rejects are
    ``"simple"``; everything else is ``"complex"``.
    """
    # The heuristics only inspect the query text, so skip SEALionLLM.__init__:
    # it builds an API client (and needs an API key) that is not used here.
    classifier = SEALionLLM.__new__(SEALionLLM)

    if classifier._is_translation_query(query) or not classifier._is_complex_query(query):
        return "simple"
    return "complex"
178
+
179
class DocumentIngestion:
    """Convert uploaded PDF files into embedded chunks in a persistent Chroma store.

    Attributes set by ``__init__``:
        embeddings: embedding backend (AlternativeEmbeddings, else OpenAIEmbeddings).
        embedding_type: label of the backend in use ("BGE-small-en" or "OpenAI").
        text_splitter: RecursiveCharacterTextSplitter (1000-char chunks, 200 overlap).
        persist_directory: Chroma directory, taken from CHROMA_PERSIST_DIRECTORY
            and defaulting to "./chroma_db".
    """

    def __init__(self):
        """Select an embedding backend, build the splitter and create the store directory.

        Raises:
            Exception: when the BGE model is unavailable and no usable OpenAI
                key is configured, or when creating the OpenAI embeddings fails.
        """
        # Use BGE embeddings by default for better performance
        try:
            self.embeddings = AlternativeEmbeddings()
            self.embedding_type = "BGE-small-en"
            if not self.embeddings.model:
                raise Exception("BGE model not available")
        except Exception:
            # Fallback to OpenAI if BGE not available
            openai_key = os.getenv("OPENAI_API_KEY")
            placeholder_keys = ("placeholder_for_embeddings", "your_openai_api_key_here")
            if not openai_key or openai_key in placeholder_keys:
                st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
                raise Exception("No embedding model available")
            try:
                self.embeddings = OpenAIEmbeddings()
                self.embedding_type = "OpenAI"
            except Exception:
                st.error("Both BGE and OpenAI embeddings failed. Please check your setup.")
                raise

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
        os.makedirs(self.persist_directory, exist_ok=True)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract the text of every page of an uploaded PDF.

        Args:
            pdf_file: binary file-like object accepted by ``PyPDF2.PdfReader``;
                its ``name`` attribute (when present) is used in messages.

        Returns:
            The page texts, each followed by a newline, or "" when the PDF is
            password-protected, has no extractable text, or cannot be read.
            Problems are reported through Streamlit messages, never raised.
        """
        file_name = getattr(pdf_file, "name", "uploaded PDF")
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            if pdf_reader.is_encrypted:
                # Many "protected" PDFs use an empty user password. decrypt()
                # reports a wrong password through a falsy return value rather
                # than an exception, so the result must be checked explicitly.
                try:
                    unlocked = pdf_reader.decrypt("")
                except Exception:
                    unlocked = False
                if not unlocked:
                    st.warning(f"PDF {file_name} is password-protected. Please provide an unprotected version.")
                    return ""

            page_texts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    # extract_text() may yield None for pages without a text layer.
                    page_texts.append((page.extract_text() or "") + "\n")
                except Exception as e:
                    st.warning(f"Could not extract text from page {page_num + 1} of {file_name}: {str(e)}")
                    continue

            text = "".join(page_texts)
            if text.strip():
                return text

            st.warning(f"No extractable text found in {file_name}. This might be a scanned PDF or image-based document.")
            return ""

        except Exception as e:
            error_msg = str(e)
            if "PyCryptodome" in error_msg:
                st.error(f"Encryption error with {file_name}: {error_msg}")
                st.info("💡 The PDF uses encryption. PyCryptodome has been installed to handle this.")
            elif "password" in error_msg.lower():
                st.error(f"Password-protected PDF: {file_name}")
                st.info("💡 Please provide an unprotected version of this PDF.")
            else:
                st.error(f"Error extracting text from {file_name}: {error_msg}")
            return ""

    def process_documents(self, uploaded_files, university_name: str = "",
                          country: str = "", document_type: str = "admission_requirements") -> List[Document]:
        """Turn uploaded PDF files into one ``Document`` per file.

        Args:
            uploaded_files: uploaded file objects exposing ``type`` and ``name``.
            university_name: stored in each document's metadata.
            country: stored in each document's metadata.
            document_type: stored in each document's metadata.

        Returns:
            Documents for the files whose text could be extracted. Non-PDF
            files and PDFs without text are counted as failures and skipped.
        """
        documents = []
        processed_count = 0
        failed_count = 0

        st.info(f"📄 Processing {len(uploaded_files)} document(s)...")

        for uploaded_file in uploaded_files:
            if uploaded_file.type != "application/pdf":
                failed_count += 1
                st.error(f"❌ Unsupported file type: **{uploaded_file.type}** for {uploaded_file.name}")
                continue

            st.write(f"🔍 Extracting text from: **{uploaded_file.name}**")
            text = self.extract_text_from_pdf(uploaded_file)

            if not text.strip():
                failed_count += 1
                st.warning(f"⚠️ Could not extract text from **{uploaded_file.name}**")
                continue

            metadata = {
                "source": uploaded_file.name,
                "university": university_name,
                "country": country,
                "document_type": document_type,
                "upload_timestamp": datetime.now().isoformat(),
                "file_id": str(uuid.uuid4())
            }
            documents.append(Document(page_content=text, metadata=metadata))
            processed_count += 1
            st.success(f"✅ Successfully processed: **{uploaded_file.name}** ({len(text)} characters)")

        # Summary
        if processed_count > 0:
            st.success(f"🎉 Successfully processed **{processed_count}** document(s)")
        if failed_count > 0:
            st.warning(f"⚠️ Failed to process **{failed_count}** document(s)")

        return documents

    def create_vector_store(self, documents: List[Document]) -> Optional[Chroma]:
        """Split the documents into chunks and store them in the persisted Chroma store.

        Returns:
            The Chroma vector store, or None (after a Streamlit error message)
            when ``documents`` is empty.
        """
        if not documents:
            st.error("No documents to process")
            return None

        # Split documents into chunks
        texts = self.text_splitter.split_documents(documents)

        # Create vector store
        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        return vectorstore

    def load_existing_vectorstore(self) -> Optional[Chroma]:
        """Open the Chroma store in ``persist_directory``.

        Returns:
            The vector store, or None (after a Streamlit warning) when
            constructing it raises.
        """
        try:
            return Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
        except Exception as e:
            st.warning(f"Could not load existing vector store: {str(e)}")
            return None
330
+
331
class RAGSystem:
    """Answer questions by retrieving chunks from Chroma and prompting a SEA-LION model.

    Attributes set by ``__init__``:
        embeddings: embedding backend (AlternativeEmbeddings, else OpenAIEmbeddings).
        sea_lion_llm: SEALionLLM instance used to generate answers.
        persist_directory: Chroma directory, taken from CHROMA_PERSIST_DIRECTORY
            and defaulting to "./chroma_db".
    """

    def __init__(self):
        """Select the embedding backend and create the LLM wrapper."""
        # Initialize embeddings - try BGE first, fallback to OpenAI
        try:
            self.embeddings = AlternativeEmbeddings()
            if not self.embeddings.model:
                # Fallback to OpenAI if BGE not available
                self.embeddings = OpenAIEmbeddings()
        except Exception:
            # If both fail, use OpenAI as last resort
            self.embeddings = OpenAIEmbeddings()

        self.sea_lion_llm = SEALionLLM()
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")

    def get_vectorstore(self) -> Optional[Chroma]:
        """Open the Chroma store, or return None (after a Streamlit error) when that raises."""
        try:
            return Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
        except Exception as e:
            st.error(f"Error loading vector store: {str(e)}")
            return None

    def query(self, question: str, language: str = "English") -> Dict[str, Any]:
        """Answer ``question`` from the ingested documents using a SEA-LION model.

        Args:
            question: the user's question; also used as the retrieval query.
            language: language the model is asked to answer in.

        Returns:
            A dict with "answer", "source_documents" and "query_id". On success
            it also contains "original_question", "language" and "model_used".
            When the store is unavailable or an error occurs, "query_id" is
            None and "source_documents" is empty.
        """
        vectorstore = self.get_vectorstore()
        if not vectorstore:
            return {
                "answer": "No documents have been ingested yet. Please upload some PDF documents first.",
                "source_documents": [],
                "query_id": None
            }

        try:
            # Retrieve relevant documents
            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
            relevant_docs = retriever.get_relevant_documents(question)

            # Prepare context from retrieved documents (first 500 chars of each)
            context_parts = []
            for i, doc in enumerate(relevant_docs, 1):
                source_info = doc.metadata.get('source', 'Unknown')
                university = doc.metadata.get('university', 'Unknown')
                country = doc.metadata.get('country', 'Unknown')

                context_parts.append(
                    f"\nDocument {i} (Source: {source_info}, University: {university}, Country: {country}):\n"
                    f"{doc.page_content[:500]}...\n"
                )

            context = "\n".join(context_parts)

            # Generate response using SEA-LION model
            answer = self.sea_lion_llm.generate_response(
                query=question,
                context=context,
                language=language
            )

            # Mirror the routing rule in generate_response: translation requests
            # always go to the instruct model, even if they also look complex.
            used_reasoning = (
                not self.sea_lion_llm._is_translation_query(question)
                and self.sea_lion_llm._is_complex_query(question)
            )

            return {
                "answer": answer,
                "source_documents": relevant_docs,
                # Generate query ID for sharing
                "query_id": str(uuid.uuid4()),
                "original_question": question,
                "language": language,
                "model_used": "SEA-LION" + (" Reasoning" if used_reasoning else " Instruct")
            }

        except Exception as e:
            st.error(f"Error querying system: {str(e)}")
            return {
                "answer": f"Error processing your question: {str(e)}",
                "source_documents": [],
                "query_id": None
            }
413
+
414
def save_query_result(query_result: Dict[str, Any]):
    """Write a query result to ``query_results/<query_id>.json`` for sharing.

    Only JSON-serializable data is kept: the question, answer, language, a
    timestamp, and for each source document its metadata plus a content
    preview of at most 200 characters.

    Returns:
        True when the file was written; False when the result has no
        ``query_id`` or writing failed (the failure is shown via Streamlit).
    """
    query_id = query_result.get("query_id")
    if not query_id:
        return False

    results_dir = "query_results"
    os.makedirs(results_dir, exist_ok=True)
    result_file = f"{results_dir}/{query_id}.json"

    # Prepare data for saving (remove non-serializable objects)
    payload = {
        "query_id": query_id,
        "question": query_result.get("original_question", ""),
        "answer": query_result["answer"],
        "language": query_result.get("language", "English"),
        "timestamp": datetime.now().isoformat(),
        "sources": [],
    }
    for doc in query_result.get("source_documents", []):
        if len(doc.page_content) > 200:
            preview = doc.page_content[:200] + "..."
        else:
            preview = doc.page_content
        payload["sources"].append({
            "source": doc.metadata.get("source", "Unknown"),
            "university": doc.metadata.get("university", "Unknown"),
            "country": doc.metadata.get("country", "Unknown"),
            "content_preview": preview,
        })

    try:
        with open(result_file, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False)
    except Exception as e:
        st.error(f"Error saving query result: {str(e)}")
        return False
    return True
448
+
449
def load_shared_query(query_id: str) -> Optional[Dict[str, Any]]:
    """Load a shared query result from ``query_results/<query_id>.json``.

    Args:
        query_id: identifier of the saved result. It must be a plain file
            name component; ids containing a path separator or NUL byte are
            rejected so a crafted id cannot read files outside the results
            directory.

    Returns:
        The decoded JSON content, or None when the id is invalid, the file
        does not exist, or it cannot be read or parsed (read and parse
        errors are shown via Streamlit).
    """
    if not isinstance(query_id, str) or not query_id:
        return None
    if any(forbidden in query_id for forbidden in ("/", "\\", "\x00")):
        return None

    result_file = f"query_results/{query_id}.json"

    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # An unknown id is an expected outcome, not an error.
        return None
    except Exception as e:
        st.error(f"Error loading shared query: {str(e)}")

    return None
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.1.0
2
+ langchain-community==0.0.10
3
+ langchain-openai==0.0.5
4
+ streamlit==1.29.0
5
+ pypdf2==3.0.1
6
+ pycryptodome==3.23.0
7
+ chromadb==0.4.22
8
+ sentence-transformers==5.1.0
9
+ faiss-cpu==1.7.4
10
+ python-dotenv==1.0.0
11
+ openai==1.6.1
12
+ tiktoken==0.5.2
13
+ streamlit-extras==0.3.5
14
+ watchdog==3.0.0
requirements_clean.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ langchain-openai
4
+ streamlit
5
+ pypdf2
6
+ chromadb
7
+ sentence-transformers
8
+ python-dotenv
9
+ openai
10
+ tiktoken
11
+ streamlit-extras
sample_documents/sample_university_requirements.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sample University Admission Requirements
2
+
3
+ ## National University of Singapore (NUS) - Computer Science Master's Program
4
+
5
+ ### Program Overview
6
+ The Master of Computing (Computer Science) program at NUS is a comprehensive graduate program designed for students seeking advanced knowledge in computer science.
7
+
8
+ ### Admission Requirements
9
+
10
+ #### Academic Requirements
11
+ - Bachelor's degree in Computer Science, Computer Engineering, or related field
12
+ - Minimum GPA of 3.5/4.0 or equivalent (Second Class Upper Honours)
13
+ - Strong background in mathematics and programming
14
+
15
+ #### English Proficiency Requirements
16
+ For international students whose native language is not English:
17
+ - IELTS: Minimum overall score of 6.5 (no band less than 6.0)
18
+ - TOEFL iBT: Minimum score of 85 (writing minimum 22)
19
+ - PTE Academic: Minimum score of 65
20
+
21
+ #### Additional Requirements
22
+ - Statement of Purpose (500-1000 words)
23
+ - Two letters of recommendation from academic or professional referees
24
+ - Resume/CV highlighting relevant experience
25
+ - Portfolio of programming projects (preferred)
26
+
27
+ ### Tuition Fees (2024-2025 Academic Year)
28
+ - Singapore Citizens: S$12,500 per year
29
+ - Singapore Permanent Residents: S$17,500 per year
30
+ - International Students: S$25,000 per year
31
+
32
+ ### Application Deadlines
33
+ - **Priority Round**: November 15, 2024
34
+ - **Regular Round**: January 31, 2025
35
+ - **Late Round**: March 15, 2025 (subject to availability)
36
+
37
+ ### Application Process
38
+ 1. Submit online application through NUS Graduate School portal
39
+ 2. Upload required documents
40
+ 3. Pay application fee of S$50
41
+ 4. Submit by deadline
42
+ 5. Attend interview if shortlisted (February-April)
43
+ 6. Admission results: April-May
44
+
45
+ ### Program Duration
46
+ - Full-time: 1.5 years (3 semesters)
47
+ - Part-time: 2.5 years (5 semesters)
48
+
49
+ ### Financial Aid
50
+ - NUS Graduate Research Scholarship available for qualifying students
51
+ - Teaching assistantships for outstanding applicants
52
+ - Industry sponsorship opportunities
53
+
54
+ ### Contact Information
55
+ - Email: [email protected]
56
+ - Phone: +65 6516 2492
57
+ - Website: www.nus.edu.sg/graduateschool
58
+
59
+ ---
60
+
61
+ ## University of Malaya (UM) - Engineering Master's Programs
62
+
63
+ ### Program Overview
64
+ The Faculty of Engineering offers various Master's degree programs in engineering disciplines.
65
+
66
+ ### Admission Requirements
67
+
68
+ #### Academic Requirements
69
+ - Bachelor's degree in Engineering or related field with minimum CGPA of 3.0/4.0
70
+ - For applicants with CGPA below 3.0, relevant work experience of at least 2 years required
71
+
72
+ #### English Proficiency Requirements
73
+ For international students:
74
+ - IELTS: Minimum overall score of 6.0 (no band less than 5.5)
75
+ - TOEFL iBT: Minimum score of 80
76
+ - MUET (Malaysian University English Test): Band 4 minimum
77
+
78
+ #### Program-Specific Requirements
79
+ - **Civil Engineering**: AutoCAD proficiency preferred
80
+ - **Electrical Engineering**: Basic knowledge of circuit analysis
81
+ - **Mechanical Engineering**: Thermodynamics and fluid mechanics background
82
+
83
+ ### Tuition Fees (2024 Academic Year)
84
+ - Malaysian Citizens: RM 8,000 per year
85
+ - International Students: RM 15,000 per year
86
+ - ASEAN Students: RM 12,000 per year (special rate)
87
+
88
+ ### Application Deadlines
89
+ - **Main Intake (September)**: April 30, 2024
90
+ - **Second Intake (February)**: October 31, 2024
91
+
92
+ ### Scholarships Available
93
+ - UM Graduate Merit Scholarship (50% tuition fee waiver)
94
+ - ASEAN Scholarship Program
95
+ - Industry-sponsored scholarships
96
+
97
+ ### Living Costs (Estimated per month)
98
+ - Accommodation: RM 500-800
99
+ - Food: RM 400-600
100
+ - Transportation: RM 100-200
101
+ - Other expenses: RM 200-300
102
+ - **Total: RM 1,200-1,900 per month**
103
+
104
+ ### Application Requirements
105
+ 1. Completed application form
106
+ 2. Academic transcripts
107
+ 3. Bachelor's degree certificate
108
+ 4. English proficiency test results
109
+ 5. Two reference letters
110
+ 6. Research proposal (for research-based programs)
111
+ 7. Passport copy
112
+ 8. Passport-sized photographs
113
+
114
+ ### Contact Information
115
+ - Email: [email protected]
116
+ - Phone: +603 7967 3026
117
+ - Address: Faculty of Engineering, University of Malaya, 50603 Kuala Lumpur, Malaysia
118
+
119
+ ---
120
+
121
+ ## Chulalongkorn University - Business Administration Master's (MBA)
122
+
123
+ ### Program Overview
124
+ The Chulalongkorn Business School MBA program is Thailand's premier business education program.
125
+
126
+ ### Admission Requirements
127
+
128
+ #### Academic Requirements
129
+ - Bachelor's degree from accredited institution
130
+ - Minimum GPA of 2.75/4.0 or equivalent
131
+ - GMAT score of 500+ (preferred) or GRE equivalent
132
+ - Minimum 2 years of work experience
133
+
134
+ #### English Proficiency Requirements
135
+ - TOEFL iBT: Minimum score of 79
136
+ - IELTS: Minimum overall score of 6.5
137
+ - CU-TEP: Minimum score of 80
138
+
139
+ ### Tuition Fees (2024-2025)
140
+ - Full-time MBA: 850,000 THB (approx. USD 25,000) total program
141
+ - Executive MBA: 1,200,000 THB (approx. USD 35,000) total program
142
+
143
+ ### Application Deadlines
144
+ - **Early Admission**: February 15, 2024
145
+ - **Regular Admission**: April 30, 2024
146
+ - **Final Round**: June 15, 2024
147
+
148
+ ### Program Duration
149
+ - Full-time MBA: 16 months
150
+ - Executive MBA: 18 months (weekend classes)
151
+
152
+ ### Scholarships
153
+ - Merit-based scholarships up to 50% tuition
154
+ - Corporate sponsorship opportunities
155
+ - Government scholarships for ASEAN students
156
+
157
+ ### Application Process
158
+ 1. Online application submission
159
+ 2. Submit required documents
160
+ 3. GMAT/GRE scores
161
+ 4. Personal interview
162
+ 5. Group discussion assessment
163
+
164
+ ### Career Support
165
+ - Career counseling services
166
+ - Industry networking events
167
+ - Internship placement assistance
168
+ - Alumni network access
169
+
170
+ ### Contact Information
171
+ - Email: [email protected]
172
+ - Phone: +66 2 218 6601
173
+ - Website: www.cbs.chula.ac.th
174
+
175
+ ---
176
+
177
+ *This document contains sample admission information for demonstration purposes. Please verify all details with the respective universities before applying.*
start.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# PanSea University Search - Startup Script
#
# Checks the local setup (.venv, .env, data directories, Python dependencies)
# and then launches the Streamlit app on port 8501. Run it from the project
# root: all paths below are relative to the current directory.

echo "🎓 Starting PanSea University Search..."

# Check if virtual environment exists
if [ ! -d ".venv" ]; then
    echo "❌ Virtual environment not found. Please run setup first."
    exit 1
fi

# Activate virtual environment
source .venv/bin/activate

# Check if .env file exists
if [ ! -f ".env" ]; then
    echo "⚠️ .env file not found. Please create one with your OpenAI API key."
    echo "Example:"
    echo "OPENAI_API_KEY=your_api_key_here"
    exit 1
fi

# Create necessary directories
mkdir -p chroma_db documents query_results

# Check if required packages are installed
echo "🔍 Checking dependencies..."
if ! python -c "import streamlit, langchain, chromadb" 2>/dev/null; then
    echo "❌ Dependencies not found. Installing..."
    # Stop here if installation fails; launching the app would only fail later
    # with a less helpful import error.
    if ! pip install -r requirements.txt; then
        echo "❌ Failed to install dependencies."
        exit 1
    fi
fi

echo "🚀 Starting Streamlit application..."
echo "📱 Open your browser to: http://localhost:8501"
echo "🛑 Press Ctrl+C to stop the application"
echo ""

# Start the Streamlit app
streamlit run app.py --server.port=8501 --server.address=0.0.0.0
test_system.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for PanSea University Search System
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add the project directory to Python path
11
+ project_dir = Path(__file__).parent
12
+ sys.path.insert(0, str(project_dir))
13
+
14
def test_imports():
    """Check that the third-party packages and the project's RAG module import.

    One status line is printed per import, and the check stops at the first
    failure.

    Returns:
        True when every import succeeds, False as soon as one raises
        ImportError.
    """
    print("🧪 Testing imports...")

    # (module to import, name used in the status messages)
    third_party = (
        ("streamlit", "Streamlit"),
        ("langchain", "LangChain"),
        ("chromadb", "ChromaDB"),
    )
    for module_name, label in third_party:
        try:
            __import__(module_name)
        except ImportError as e:
            print(f"❌ Failed to import {label}: {e}")
            return False
        print(f"✅ {label} imported successfully")

    try:
        from rag_system import DocumentIngestion, RAGSystem
    except ImportError as e:
        print(f"❌ Failed to import RAG system: {e}")
        return False
    print("✅ RAG system modules imported successfully")

    return True
47
+
48
def test_environment():
    """Check that a .env file exists and provides the required API keys.

    Returns:
        True when both SEA_LION_API_KEY and OPENAI_API_KEY are set to
        non-placeholder values; False when the .env file is missing, a key is
        missing or still a placeholder, or python-dotenv cannot be imported.
    """
    print("\n🔧 Testing environment...")

    # Check if .env file exists
    if not (project_dir / ".env").exists():
        print("⚠️ .env file not found. You'll need to create one with your OpenAI API key.")
        return False

    # Load environment variables
    try:
        from dotenv import load_dotenv
        load_dotenv()

        # Check SEA-LION API key
        sea_lion_key = os.getenv("SEA_LION_API_KEY")
        if not sea_lion_key:
            print("⚠️ SEA_LION_API_KEY not found in .env file")
            return False
        if sea_lion_key == "your_sea_lion_api_key_here":
            print("⚠️ Please update SEA_LION_API_KEY in .env file with your actual API key")
            return False
        print(f"✅ SEA-LION API key configured (length: {len(sea_lion_key)})")

        # Check OpenAI API key (for embeddings)
        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key or openai_key == "your_openai_api_key_here":
            print("⚠️ OpenAI API key not properly configured. This is needed for document embeddings.")
            return False
        print(f"✅ OpenAI API key configured (length: {len(openai_key)})")

        return True

    except ImportError as e:
        print(f"❌ Failed to load environment: {e}")
        return False
87
+
88
def test_directories():
    """Make sure the data directories exist under the project directory.

    Each of chroma_db, documents and query_results is created if missing.

    Returns:
        True when all directories are ready, False at the first one that
        cannot be created.
    """
    print("\n📁 Testing directories...")

    for dir_name in ("chroma_db", "documents", "query_results"):
        target = project_dir / dir_name
        try:
            target.mkdir(exist_ok=True)
        except Exception as e:
            print(f"❌ Failed to create directory '{dir_name}': {e}")
            return False
        print(f"✅ Directory '{dir_name}' ready")

    return True
104
+
105
def test_basic_functionality():
    """Smoke-test the RAG components: construction, query classification, text splitting.

    A misclassified query only produces a warning. A text splitter that
    returns no chunks, or any exception, fails the test.

    Returns:
        True when the components initialize and the splitter yields chunks,
        False otherwise.
    """
    print("\n⚡ Testing basic functionality...")

    try:
        from rag_system import DocumentIngestion, SEALionLLM

        # Test document ingestion initialization
        doc_ingestion = DocumentIngestion()
        print("✅ DocumentIngestion initialized successfully")

        # Test SEA-LION LLM
        sea_lion = SEALionLLM()
        print("✅ SEALionLLM initialized successfully")

        # Test query classification
        is_complex1 = sea_lion._is_complex_query(
            "Show me universities in Malaysia for master's degree under 40000 RMB"
        )
        is_complex2 = sea_lion._is_complex_query("What does IELTS stand for?")

        if not is_complex1 or is_complex2:
            print(f"⚠️ Query classification may need adjustment (complex: {is_complex1}, simple: {is_complex2})")
        else:
            print("✅ Query classification working correctly")

        # Test text splitter
        sample_text = "This is a test document. It contains multiple sentences. Each sentence should be processed correctly."
        chunks = doc_ingestion.text_splitter.split_text(sample_text)
        if not chunks:
            print("❌ Text splitter not working")
            return False
        print(f"✅ Text splitter working (created {len(chunks)} chunks)")

        return True

    except Exception as e:
        print(f"❌ Basic functionality test failed: {e}")
        return False
146
+
147
def main():
    """Run every system check in order and print a summary.

    All checks run even if an earlier one fails. A check that raises is
    reported and counted as failed.

    Returns:
        True when every check returned a truthy value, False otherwise.
    """
    print("🎓 PanSea University Search - System Test")
    print("=" * 50)

    checks = (
        ("Import Test", test_imports),
        ("Environment Test", test_environment),
        ("Directory Test", test_directories),
        ("Functionality Test", test_basic_functionality),
    )

    outcomes = []
    for label, check in checks:
        try:
            outcomes.append(bool(check()))
        except Exception as e:
            print(f"❌ {label} failed with exception: {e}")
            outcomes.append(False)
    all_passed = all(outcomes)

    print("\n" + "=" * 50)
    if all_passed:
        summary = (
            "✅ All tests passed! Your system is ready to use.",
            "\nTo start the application, run:",
            " ./start.sh",
            "\nOr manually with:",
            " streamlit run app.py",
        )
    else:
        summary = (
            "❌ Some tests failed. Please fix the issues above before running the application.",
            "\nCommon solutions:",
            "1. Make sure you've activated the virtual environment:",
            " source .venv/bin/activate",
            "2. Install dependencies:",
            " pip install -r requirements.txt",
            "3. Create .env file with your API keys:",
            " SEA_LION_API_KEY=your_sea_lion_api_key",
            " OPENAI_API_KEY=your_openai_api_key",
        )
    for line in summary:
        print(line)

    return all_passed


if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 otherwise.
    sys.exit(0 if main() else 1)