Spaces:

ErvinYubo
/

TopEdu

Sleeping

App Files Files Community

Ervinoreo committed 25 days ago

Commit

e9e516f

1 Parent(s): dab98a9

firsst

Browse files

Files changed (8) hide show

.gitignore +109 -0
app.py +632 -0
rag_system.py +460 -0
requirements.txt +14 -0
requirements_clean.txt +11 -0
sample_documents/sample_university_requirements.txt +177 -0
start.sh +43 -0
test_system.py +192 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,109 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Virtual Environment
+.venv/
+.env/
+venv/
+ENV/
+env/
+.venv
+# Environment Variables
+.env
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+# Windows
+Thumbs.db
+ehthumbs.db
+Desktop.ini
+# Jupyter Notebooks
+.ipynb_checkpoints
+# AI/ML specific
+chroma_db/
+chromadb/
+*.db
+*.sqlite
+*.sqlite3
+# Document storage
+documents/
+uploaded_documents/
+temp_documents/
+# Query results and cache
+query_results/
+.cache/
+.streamlit/
+# Model downloads and cache
+models/
+.transformers_cache/
+.huggingface/
+sentence_transformers_cache/
+# Logs
+*.log
+logs/
+.logs/
+# Temporary files
+tmp/
+temp/
+.tmp/
+# Coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json

app.py ADDED Viewed

	@@ -0,0 +1,632 @@

+import streamlit as st
+import os
+from urllib.parse import urlparse, parse_qs
+from rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
+from datetime import datetime
+import uuid
+# Configure the Streamlit page: browser-tab title and icon, wide layout,
+# and the sidebar expanded on first load.
+st.set_page_config(
+    page_title="PanSea University Search",
+    page_icon="🎓",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Custom CSS - Dark theme compatible.
+# Injected as a raw <style> element through st.markdown; unsafe_allow_html=True
+# is what lets the tag through. The .main-header, .query-result, .share-link,
+# .model-info and .language-selection classes are used by HTML snippets emitted
+# by the page functions in this file; the .st* selectors target Streamlit's own
+# widgets.
+# NOTE(review): .css-1d391kg looks like an auto-generated Streamlit class name
+# for the sidebar and may not be stable across Streamlit versions - confirm.
+st.markdown("""
+<style>
+    .main-header {
+        text-align: center;
+        padding: 2rem 0;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        margin: -1rem -1rem 2rem -1rem;
+        border-radius: 10px;
+        box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
+    }
+    .stApp {
+        background: var(--background-color);
+    }
+    /* Dark theme compatible containers */
+    .query-result {
+        background: rgba(255, 255, 255, 0.05);
+        backdrop-filter: blur(10px);
+        border: 1px solid rgba(255, 255, 255, 0.1);
+        padding: 1.5rem;
+        border-radius: 15px;
+        margin: 1rem 0;
+        color: var(--text-color);
+    }
+    .source-doc {
+        background: rgba(31, 119, 180, 0.1);
+        backdrop-filter: blur(5px);
+        padding: 1rem;
+        border-left: 4px solid #1f77b4;
+        border-radius: 8px;
+        margin: 0.5rem 0;
+        color: var(--text-color);
+    }
+    .share-link {
+        background: rgba(46, 204, 113, 0.1);
+        backdrop-filter: blur(5px);
+        padding: 1rem;
+        border-radius: 10px;
+        border-left: 4px solid #2ecc71;
+        color: var(--text-color);
+    }
+    /* Model indicator boxes */
+    .model-info {
+        background: rgba(52, 152, 219, 0.15);
+        backdrop-filter: blur(10px);
+        padding: 15px;
+        border-radius: 12px;
+        border-left: 4px solid #3498db;
+        margin: 10px 0;
+    }
+    /* Language selection enhancement */
+    .language-selection {
+        background: rgba(155, 89, 182, 0.1);
+        backdrop-filter: blur(10px);
+        padding: 15px;
+        border-radius: 12px;
+        border-left: 4px solid #9b59b6;
+        margin: 10px 0;
+    }
+    /* Upload area enhancement */
+    .stFileUploader {
+        background: rgba(230, 126, 34, 0.1);
+        backdrop-filter: blur(10px);
+        padding: 20px;
+        border-radius: 15px;
+        border: 2px dashed #e67e22;
+    }
+    .stFileUploader label {
+        font-size: 1.2rem;
+        font-weight: bold;
+        color: var(--text-color);
+    }
+    /* Button enhancements */
+    .stButton > button {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        border: none;
+        border-radius: 10px;
+        padding: 0.6rem 1.5rem;
+        font-weight: 600;
+        transition: all 0.3s ease;
+        box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
+    }
+    .stButton > button:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
+    }
+    /* Sidebar enhancements */
+    .css-1d391kg {
+        background: rgba(255, 255, 255, 0.02);
+        backdrop-filter: blur(10px);
+    }
+    /* Info boxes */
+    .stInfo {
+        background: rgba(52, 152, 219, 0.1);
+        backdrop-filter: blur(10px);
+        border-left: 4px solid #3498db;
+    }
+    .stSuccess {
+        background: rgba(46, 204, 113, 0.1);
+        backdrop-filter: blur(10px);
+        border-left: 4px solid #2ecc71;
+    }
+    .stWarning {
+        background: rgba(241, 196, 15, 0.1);
+        backdrop-filter: blur(10px);
+        border-left: 4px solid #f1c40f;
+    }
+    .stError {
+        background: rgba(231, 76, 60, 0.1);
+        backdrop-filter: blur(10px);
+        border-left: 4px solid #e74c3c;
+    }
+</style>
+""", unsafe_allow_html=True)
+def main():
+    """Entry point for the UI: show a shared result or the page chosen in the sidebar.
+
+    If the URL has a ``share`` query parameter, only that shared result is
+    rendered and the function returns. Otherwise the header and the sidebar
+    (navigation plus an embedding-model indicator) are drawn and the selected
+    page function is called.
+    """
+    # st.query_params yields a plain string per key (not a list), so the value
+    # is used directly; indexing it with [0] would keep only its first character.
+    shared_query_id = st.query_params.get("share")
+    if shared_query_id:
+        display_shared_query(shared_query_id)
+        return
+    # Main header
+    st.markdown("""
+    <div class="main-header">
+        <h1>🎓 PanSea University Search</h1>
+        <p>AI-Powered Study Search Platform for ASEAN Universities</p>
+    </div>
+    """, unsafe_allow_html=True)
+    # Sidebar
+    with st.sidebar:
+        st.header("📋 Navigation")
+        page = st.selectbox(
+            "Choose a page:",
+            ["🔍 Search Universities", "📄 Upload Documents", "ℹ️ About"]
+        )
+        # Show embedding model info
+        st.markdown("---")
+        try:
+            temp_rag = RAGSystem()
+            # A truthy ``model`` attribute marks the local BGE embeddings;
+            # anything else is labelled as the OpenAI fallback.
+            uses_local_model = bool(getattr(temp_rag.embeddings, "model", None))
+        except Exception:
+            # The indicator is purely informational: if the RAG system cannot
+            # be built here, skip the box rather than break navigation.
+            uses_local_model = None
+        if uses_local_model is not None:
+            embedding_label = "BGE-small-en-v1.5" if uses_local_model else "OpenAI Ada-002"
+            st.markdown(f"""
+            <div class='language-selection'>
+                <h5 style='margin: 0; color: #9b59b6;'>🔧 Embedding Model</h5>
+                <p style='margin: 5px 0; font-size: 0.9em;'>{embedding_label}</p>
+            </div>
+            """, unsafe_allow_html=True)
+    # Main content based on selected page
+    if page == "📄 Upload Documents":
+        upload_documents_page()
+    elif page == "ℹ️ About":
+        about_page()
+    else:
+        search_page()
+def upload_documents_page():
+    """Render the upload form and ingest the submitted PDFs.
+
+    Collects university metadata, accepts one or more PDF files and, when
+    "Process Documents" is pressed, hands them to DocumentIngestion. Success,
+    failure and per-file metadata are reported on the page.
+    """
+    st.header("📄 Upload University Documents")
+    st.write("Upload PDF documents containing university admission requirements, fees, and program information.")
+    left_col, right_col = st.columns(2)
+    with left_col:
+        university_name = st.text_input("🏫 University Name", placeholder="e.g., National University of Singapore")
+        country = st.selectbox(
+            "🌏 Country",
+            ["", "Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei", "Cambodia", "Laos", "Myanmar"]
+        )
+    with right_col:
+        document_type = st.selectbox(
+            "📋 Document Type",
+            ["admission_requirements", "tuition_fees", "program_catalog", "application_guide", "scholarship_info"]
+        )
+        # The chosen language is shown in the form but is not forwarded to the
+        # ingestion call below.
+        st.selectbox(
+            "🌐 Primary Language",
+            ["English", "Chinese", "Malay", "Thai", "Indonesian", "Vietnamese", "Filipino", "Other"]
+        )
+    # File upload
+    uploaded_files = st.file_uploader(
+        "Choose PDF files",
+        accept_multiple_files=True,
+        type=['pdf'],
+        help="Select one or more PDF files to upload"
+    )
+    # The process button is only drawn once at least one file is attached.
+    if not uploaded_files:
+        return
+    if not st.button("🚀 Process Documents", type="primary"):
+        return
+    if not (university_name and country):
+        st.error("Please provide university name and country.")
+        return
+    with st.spinner("Processing documents... This may take a few minutes."):
+        try:
+            ingestion = DocumentIngestion()
+            processed_docs = ingestion.process_documents(
+                uploaded_files, university_name, country, document_type
+            )
+            if not processed_docs:
+                st.error("No documents were successfully processed.")
+                return
+            # Create or update vector store
+            vectorstore = ingestion.create_vector_store(processed_docs)
+            if not vectorstore:
+                return
+            st.success(f"✅ Successfully processed {len(processed_docs)} documents!")
+            st.info(f"Documents from {university_name} ({country}) have been added to the knowledge base.")
+            # Show processed files
+            with st.expander("📋 Processed Files"):
+                for doc in processed_docs:
+                    meta = doc.metadata
+                    st.write(f"• **{meta['source']}**")
+                    st.write(f"  - University: {meta['university']}")
+                    st.write(f"  - Country: {meta['country']}")
+                    st.write(f"  - Type: {meta['document_type']}")
+                    st.write("---")
+        except Exception as e:
+            st.error(f"Error processing documents: {str(e)}")
+def search_page():
+    """Render the search page and run a query when the search button is pressed.
+
+    The page shows a response-language picker, example queries, the question
+    box, optional filters and, after a search, the answer with its sources.
+    Selected filters are appended to the question text before it is passed to
+    the RAG system. Clicking an example copies it into the question box so it
+    is still there when the search button is pressed.
+    """
+    # Session-state key of the question box; the example buttons write to it.
+    query_key = "search_query_input"
+
+    def use_example(example_text):
+        """Button callback: place the chosen example into the question box."""
+        st.session_state[query_key] = example_text
+
+    def example_label(example_text):
+        """Shorten long examples for the button face; short ones are shown in full."""
+        return example_text if len(example_text) <= 60 else example_text[:60] + "..."
+
+    st.header("🔍 Search University Information")
+    # Language selection
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        st.write("Ask questions about university admissions, requirements, fees, and programs:")
+    with col2:
+        response_language = st.selectbox(
+            "Response Language",
+            ["English", "中文 (Chinese)", "Bahasa Malaysia", "ไทย (Thai)", "Bahasa Indonesia", "Tiếng Việt (Vietnamese)"],
+            key="response_language"
+        )
+    # Maps the selectbox label to the language name passed to the RAG system.
+    language_map = {
+        "English": "English",
+        "中文 (Chinese)": "Chinese",
+        "Bahasa Malaysia": "Malay",
+        "ไทย (Thai)": "Thai",
+        "Bahasa Indonesia": "Indonesian",
+        "Tiếng Việt (Vietnamese)": "Vietnamese"
+    }
+    selected_lang = language_map.get(response_language, "English")
+    if selected_lang != "English":
+        st.info(f"🌐 AI will respond in **{selected_lang}** based on your selection")
+    # Example queries with model indicators
+    st.markdown("**💡 Example queries:**")
+    # Add model selection explanation
+    st.markdown("""
+    <div class='model-info'>
+        <h4 style='margin: 0; color: #3498db;'>🤖 AI Model Selection</h4>
+        <p style='margin: 5px 0;'><strong>🧠 Reasoning Model (SEA-LION v3.5):</strong> Complex university searches with multiple criteria, comparisons, budget constraints</p>
+        <p style='margin: 5px 0;'><strong>⚡ Instruct Model (SEA-LION v3):</strong> Simple questions, translations, definitions, basic information</p>
+        <p style='margin: 5px 0; font-style: italic;'>The system automatically chooses the best model for your query!</p>
+    </div>
+    """, unsafe_allow_html=True)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("**🧠 Complex Queries (Uses Reasoning Model):**")
+        complex_examples = [
+            "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
+            "专科毕业，无雅思，想在马来西亚读硕士，学费不超过4万人民币/年",
+            "Compare engineering programs in Thailand and Singapore under $15,000 per year",
+            "Find MBA programs in ASEAN with GMAT requirements and scholarships available"
+        ]
+        for example in complex_examples:
+            st.markdown(f"• {example}")
+    with col2:
+        st.markdown("**⚡ Simple Queries (Uses Instruct Model):**")
+        simple_examples = [
+            "What does IELTS stand for?",
+            "Translate 'application deadline' to Chinese",
+            "What is the difference between bachelor and master degree?",
+            "How to say 'university' in Thai?"
+        ]
+        for example in simple_examples:
+            st.markdown(f"• {example}")
+    st.markdown("---")  # Separator line
+    # Query input - main input field (always available). It is keyed so an
+    # example button can fill it in through session state.
+    query = st.text_area(
+        "Your question:",
+        height=100,
+        placeholder="e.g., What are the admission requirements for computer science programs in Singapore?",
+        help="Type your question here or select an example below to get started.",
+        key=query_key
+    )
+    # Show search status
+    if query.strip():
+        st.success("✅ Ready to search! Click the search button when you're ready.")
+    else:
+        st.info("💭 Enter your question in the text box above to start searching.")
+    # Optional: Quick example selection (just for convenience).
+    # Streamlit runs on_click callbacks before the next script run, so the
+    # example is already in session state when the question box is rebuilt and
+    # it survives the later rerun triggered by the search button.
+    with st.expander("💡 Example Queries (Click to Use)"):
+        col1, col2 = st.columns(2)
+        with col1:
+            st.markdown("**🧠 Complex Examples:**")
+            for index, example in enumerate(complex_examples):
+                st.button(
+                    example_label(example),
+                    key=f"ex_complex_{index}",
+                    help=f"Click to use: {example}",
+                    on_click=use_example,
+                    args=(example,)
+                )
+        with col2:
+            st.markdown("**⚡ Simple Examples:**")
+            for index, example in enumerate(simple_examples):
+                st.button(
+                    example_label(example),
+                    key=f"ex_simple_{index}",
+                    help=f"Click to use: {example}",
+                    on_click=use_example,
+                    args=(example,)
+                )
+    # Additional filters
+    with st.expander("🔧 Advanced Filters (Optional)"):
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            budget_range = st.select_slider(
+                "Budget Range (USD/year)",
+                options=["Any", "<10k", "10k-20k", "20k-30k", "30k-40k", ">40k"],
+                value="Any"
+            )
+        with col2:
+            study_level = st.multiselect(
+                "Study Level",
+                ["Diploma", "Bachelor", "Master", "PhD"],
+                default=[]
+            )
+        with col3:
+            preferred_countries = st.multiselect(
+                "Preferred Countries",
+                ["Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei"],
+                default=[]
+            )
+    # Search button - enabled as soon as there's text in the query. Because it
+    # is disabled for an empty query, no further empty check is needed below.
+    search_disabled = not query.strip()
+    button_text = "🔍 Search" if not search_disabled else "🔍 Search (Enter a question first)"
+    if st.button(button_text, type="primary", disabled=search_disabled):
+        with st.spinner("Searching for information..."):
+            try:
+                # Initialize RAG system
+                rag_system = RAGSystem()
+                # Show which model will be used
+                from rag_system import classify_query_type
+                query_type = classify_query_type(query)
+                if query_type == "complex":
+                    st.info("🧠 **Using SEA-LION Reasoning Model (v3.5)** - Complex query detected")
+                else:
+                    st.info("⚡ **Using SEA-LION Instruct Model (v3)** - Simple query/translation detected")
+                # Show translation status if not English
+                if response_language != "English":
+                    st.info(f"🌐 **Translating response to {response_language}**")
+                # Add filters to query if specified
+                filters = []
+                if budget_range != "Any":
+                    filters.append(f"budget range: {budget_range}")
+                if study_level:
+                    filters.append(f"study levels: {', '.join(study_level)}")
+                if preferred_countries:
+                    filters.append(f"countries: {', '.join(preferred_countries)}")
+                enhanced_query = query
+                if filters:
+                    enhanced_query += f"\n\nAdditional filters: {'; '.join(filters)}"
+                # Get response
+                result = rag_system.query(enhanced_query, selected_lang)
+                if result:
+                    # Save query result for sharing
+                    save_query_result(result)
+                    # Display results
+                    display_query_result(result, show_share_link=True)
+                else:
+                    st.error("No results found. Try rephrasing your question or upload more documents.")
+            except Exception as e:
+                st.error(f"Error searching: {str(e)}")
+def display_query_result(result, show_share_link=False):
+    """Render one query result: model used, answer, optional share link, sources.
+
+    ``result`` is read as a mapping with an "answer" entry and optional
+    "model_used", "query_id" and "source_documents" entries. Each source
+    document is accessed through ``.metadata`` (a mapping) and
+    ``.page_content`` (text). The share link is only shown when
+    ``show_share_link`` is true and the result carries a query id.
+    """
+    st.markdown('<div class="query-result">', unsafe_allow_html=True)
+    # Show which model was used
+    model_name = result.get("model_used")
+    if model_name:
+        st.info(f"🤖 **Model Used:** {result['model_used']}")
+    st.subheader("🎯 Answer")
+    st.write(result["answer"])
+    # Share link
+    if show_share_link and result.get("query_id"):
+        st.markdown("---")
+        server_address = st.get_option("browser.serverAddress")
+        current_url = server_address if server_address else "localhost:8501"
+        share_url = f"http://{current_url}?share={result['query_id']}"
+        st.markdown(f"""
+        <div class="share-link">
+            <strong>🔗 Share this result:</strong><br>
+            <code>{share_url}</code>
+        </div>
+        """, unsafe_allow_html=True)
+        if st.button("📋 Copy Share Link"):
+            st.code(share_url)
+    # Source documents
+    sources = result.get("source_documents")
+    if sources:
+        st.markdown("---")
+        st.subheader("📚 Sources")
+        for position, source_doc in enumerate(sources, 1):
+            meta = source_doc.metadata
+            with st.expander(f"Source {position}: {meta.get('source', 'Unknown')}"):
+                details_col, content_col = st.columns([1, 2])
+                with details_col:
+                    st.write(f"**University:** {meta.get('university', 'Unknown')}")
+                    st.write(f"**Country:** {meta.get('country', 'Unknown')}")
+                    st.write(f"**Type:** {meta.get('document_type', 'Unknown')}")
+                with content_col:
+                    st.write("**Relevant Content:**")
+                    # Previews are capped at 300 characters, with an ellipsis
+                    # only when something was cut off.
+                    text = source_doc.page_content
+                    if len(text) > 300:
+                        content_preview = text[:300] + "..."
+                    else:
+                        content_preview = text
+                    st.write(content_preview)
+    st.markdown('</div>', unsafe_allow_html=True)
+def display_shared_query(query_id):
+    """Display a previously saved query result identified by ``query_id``.
+
+    The saved record is loaded with load_shared_query and read through its
+    "question", "language", "timestamp", "answer" and optional "sources"
+    entries. When no record is found an error is shown instead. In both cases
+    a button clears the URL query parameters and reruns the app, which brings
+    the visitor back to the normal pages.
+    """
+    from types import SimpleNamespace
+
+    st.header("🔗 Shared Query Result")
+    result_data = load_shared_query(query_id)
+    if not result_data:
+        st.error("❌ Shared query not found or has expired.")
+        if st.button("🏠 Go to Home"):
+            # Use the same query-parameter API as main(); Streamlit does not
+            # allow st.query_params to be mixed with the experimental calls.
+            st.query_params.clear()
+            st.rerun()
+        return
+    st.info(f"**Original Question:** {result_data['question']}")
+    st.write(f"**Language:** {result_data['language']}")
+    # Only the first 10 characters of the stored timestamp are displayed.
+    st.write(f"**Date:** {result_data['timestamp'][:10]}")
+    # Saved sources are plain mappings; wrap each one so it offers the
+    # .metadata / .page_content attributes display_query_result reads.
+    mock_result = {
+        "answer": result_data["answer"],
+        "source_documents": [
+            SimpleNamespace(
+                metadata=source,
+                page_content=source.get('content_preview', '')
+            )
+            for source in result_data.get('sources', [])
+        ]
+    }
+    display_query_result(mock_result, show_share_link=False)
+    if st.button("🔍 Ask Your Own Question"):
+        st.query_params.clear()
+        st.rerun()
+def about_page():
+    """Render the static About page as two columns of markdown."""
+    # Wide left column: problem statement, solution, models, languages, workflow.
+    overview_md = """
+        ### 🎯 Problem We Solve
+        Prospective students worldwide seeking to study abroad face difficulty finding accurate, up-to-date university admission requirements. Information is scattered across PDFs, brochures, and outdated agency websites. Many waste time applying to unsuitable programs due to missing criteria and pay high agent fees.
+        ### 💡 Our Solution
+        PanSea is an LLM-powered, RAG-based study search platform powered by **SEA-LION models** that ingests official admissions documents from ASEAN universities. Students can query in any ASEAN language and receive:
+        - 📋 **Ranked program matches** with detailed requirements
+        - 💰 **Tuition fees and costs**
+        - 📅 **Application deadlines and windows**
+        - 🎓 **Entry requirements and prerequisites**
+        - 📖 **Source citations** from official documents
+        ### 🤖 AI Models Used
+        - **SEA-LION v3.5 Reasoning Model**: For complex university search queries requiring multi-step reasoning
+        - **SEA-LION v3 Instruct Model**: For translation and simple question-answering
+        - **Automatic Model Selection**: The system intelligently chooses the appropriate model based on query complexity
+        ### 🌏 Supported Languages
+        - English
+        - 中文 (Chinese)
+        - Bahasa Malaysia
+        - ไทย (Thai)
+        - Bahasa Indonesia
+        - Tiếng Việt (Vietnamese)
+        - Filipino
+        ### 🔧 How It Works
+        1. **📄 Document Ingestion**: Upload official PDF documents from universities
+        2. **🔍 AI Processing**: Our system processes and indexes the content
+        3. **❓ Natural Language Queries**: Ask questions in your preferred language
+        4. **🎯 Intelligent Answers**: Get relevant, sourced responses
+        5. **🔗 Share Results**: Generate shareable links for your queries
+        """
+    # Narrow right column: feature list, target countries, getting started.
+    highlights_md = """
+        ### 📊 Features
+        ✅ **Multi-language support**
+        ✅ **PDF document ingestion**
+        ✅ **Intelligent search & retrieval**
+        ✅ **Source citations**
+        ✅ **Shareable query results**
+        ✅ **Advanced filtering**
+        ✅ **Real-time processing**
+        ### 🏛️ Target Universities
+        - 🇸🇬 Singapore
+        - 🇲🇾 Malaysia
+        - 🇹🇭 Thailand
+        - 🇮🇩 Indonesia
+        - 🇵🇭 Philippines
+        - 🇻🇳 Vietnam
+        - 🇧🇳 Brunei
+        - 🇰🇭 Cambodia
+        - 🇱🇦 Laos
+        - 🇲🇲 Myanmar
+        ### 🚀 Get Started
+        1. Go to **Upload Documents** to add university PDFs
+        2. Use **Search Universities** to ask questions
+        3. Share your results with others!
+        """
+    st.header("ℹ️ About PanSea University Search")
+    overview_col, highlights_col = st.columns([2, 1])
+    with overview_col:
+        st.markdown(overview_md)
+    with highlights_col:
+        st.markdown(highlights_md)
+if __name__ == "__main__":
+    # The SEA-LION key is mandatory: without it, explain how to set it and
+    # stop the script before any page is drawn.
+    sea_lion_key = os.getenv("SEA_LION_API_KEY")
+    if not sea_lion_key:
+        st.error("🚨 SEA-LION API Key not found! Please set your SEA_LION_API_KEY in the .env file.")
+        st.code("SEA_LION_API_KEY=your_api_key_here")
+        st.stop()
+    # The OpenAI key is optional: a missing value or the template placeholder
+    # only produces a warning, and the app still starts.
+    openai_key = os.getenv("OPENAI_API_KEY")
+    if not openai_key or openai_key == "your_openai_api_key_here":
+        st.warning("⚠️ OpenAI API Key not configured properly. You'll need it for document embeddings.")
+        st.info("The system will use SEA-LION models for text generation, but OpenAI for document embeddings.")
+    main()

rag_system.py ADDED Viewed

	@@ -0,0 +1,460 @@

+import os
+import uuid
+import tempfile
+from typing import List, Optional, Dict, Any
+import streamlit as st
+from pathlib import Path
+import PyPDF2
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.schema import Document
+from dotenv import load_dotenv
+import chromadb
+from datetime import datetime
+import json
+import base64
+from openai import OpenAI
+import re
+# Load environment variables via python-dotenv at import time, so the
+# os.getenv() lookups in this module (API keys, base URL, Chroma directory)
+# can be satisfied from a .env file.
+load_dotenv()
+class AlternativeEmbeddings:
+    """Alternative embeddings using Sentence Transformers when OpenAI is not available.
+
+    Exposes ``embed_documents`` and ``embed_query``. When the
+    sentence-transformers package cannot be imported, ``model`` is None and
+    both methods return an empty list.
+    """
+    def __init__(self):
+        # Initialise both attributes up front so the instance is fully formed
+        # even when the import below fails.
+        self.model = None
+        # Vector length recorded for the BGE-small model loaded below.
+        self.embedding_size = 384
+        try:
+            from sentence_transformers import SentenceTransformer
+            # Use BGE-small-en for better performance
+            self.model = SentenceTransformer('BAAI/bge-small-en-v1.5')
+        except ImportError:
+            st.error("sentence-transformers not available. Please install it or provide OpenAI API key.")
+
+    def embed_documents(self, texts):
+        """Return one embedding (list of floats) per text, or [] without a model."""
+        if not self.model:
+            return []
+        return self.model.encode(texts).tolist()
+
+    def embed_query(self, text):
+        """Return the embedding of a single text, or [] without a model."""
+        if not self.model:
+            return []
+        return self.model.encode([text])[0].tolist()
+class SEALionLLM:
+    """Custom LLM class for SEA-LION models.
+
+    Wraps an OpenAI-compatible client pointed at the SEA-LION endpoint and
+    picks between the instruct model and the reasoning model per query, based
+    on keyword and pattern heuristics over the query text.
+    """
+
+    # Keywords that indicate a university-search query. Stored as a set so a
+    # word shared by several languages (e.g. "program") is counted once and
+    # cannot reach the "multiple criteria" threshold on its own.
+    _COMPLEX_KEYWORDS = frozenset({
+        "university", "admission", "requirement", "tuition", "fee", "program", "course",
+        "degree", "master", "bachelor", "phd", "scholarship", "deadline", "application",
+        "budget", "under", "less than", "below", "compare", "recommend", "suggest",
+        "which", "what are the", "show me", "find me", "search for",
+        # Chinese keywords
+        "大学", "学费", "专业", "硕士", "学士", "博士", "申请", "要求", "奖学金",
+        # Malay keywords
+        "universiti", "yuran", "ijazah", "syarat", "permohonan",
+        # Thai keywords
+        "มหาวิทยาลัย", "ค่าเล่าเรียน", "หลักสูตร", "ปริญญา", "เงื่อนไข",
+        # Indonesian keywords
+        "universitas", "biaya", "kuliah", "sarjana", "persyaratan",
+    })
+    # Comparison words, numbers and conditions; each match counts double.
+    _COMPARISON_PATTERNS = tuple(re.compile(pattern) for pattern in (
+        r'under \$?\d+', r'less than \$?\d+', r'below \$?\d+', r'between \$?\d+ and \$?\d+',
+        r'不超过.*元', r'低于.*元', r'少于.*元',  # Chinese
+        r'kurang dari', r'di bawah',  # Malay/Indonesian
+        r'น้อยกว่า', r'ต่ำกว่า',  # Thai
+    ))
+    _TRANSLATION_KEYWORDS = (
+        "translate", "translation", "แปล", "翻译", "terjemah", "traduire",
+    )
+
+    def __init__(self):
+        self.client = OpenAI(
+            api_key=os.getenv("SEA_LION_API_KEY"),
+            base_url=os.getenv("SEA_LION_BASE_URL", "https://api.sea-lion.ai/v1")
+        )
+        # Model configurations
+        self.instruct_model = "aisingapore/Gemma-SEA-LION-v3-9B-IT"
+        self.reasoning_model = "aisingapore/Llama-SEA-LION-v3.5-8B-R"
+
+    def _is_complex_query(self, query: str) -> bool:
+        """Determine if query requires reasoning model or simple instruct model.
+
+        Scores one point per distinct keyword contained in the lower-cased
+        query and two points per matching comparison pattern; a score of 2 or
+        more means "complex". Matching is by substring, not by whole word.
+        """
+        query_lower = query.lower()
+        criteria_count = sum(
+            1 for keyword in self._COMPLEX_KEYWORDS if keyword in query_lower
+        )
+        criteria_count += 2 * sum(
+            1 for pattern in self._COMPARISON_PATTERNS if pattern.search(query_lower)
+        )
+        # Complex query if multiple keywords or comparison patterns found
+        return criteria_count >= 2
+
+    def _is_translation_query(self, query: str) -> bool:
+        """Check if query is primarily for translation (contains a translation keyword)."""
+        query_lower = query.lower()
+        return any(keyword in query_lower for keyword in self._TRANSLATION_KEYWORDS)
+
+    def generate_response(self, query: str, context: str = "", language: str = "English") -> str:
+        """Generate response using appropriate SEA-LION model.
+
+        Args:
+            query: The user's question, sent as the user message.
+            context: Document text embedded at the end of the system prompt.
+            language: Language the system prompt asks the model to answer in.
+
+        Returns:
+            The model's reply text. If the API call raises, the error is shown
+            with st.error and an apology string containing the error message
+            is returned instead of raising.
+        """
+        # Translation requests and queries that are not "complex" go to the
+        # instruct model; everything else goes to the reasoning model.
+        use_reasoning = not self._is_translation_query(query) and self._is_complex_query(query)
+        model = self.reasoning_model if use_reasoning else self.instruct_model
+        # Prepare messages
+        system_prompt = f"""You are a helpful assistant specializing in ASEAN university admissions.
+        Respond in {language} unless specifically asked otherwise.
+        If provided with context from university documents, use that information to give accurate, specific answers.
+        Always cite your sources when using provided context.
+        For complex university search queries, provide:
+        1. Direct answers to the question
+        2. Relevant admission requirements
+        3. Tuition fees (if available)
+        4. Application deadlines (if available)
+        5. Source citations from the documents
+        Context: {context}"""
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": query}
+        ]
+        if use_reasoning:
+            # Use reasoning model with thinking mode
+            # NOTE(review): confirm the exact extra_body shape the SEA-LION
+            # endpoint expects for enabling thinking mode.
+            request_options = {
+                "max_tokens": 2000,
+                "temperature": 0.1,
+                "extra_body": {"thinking_mode": True},
+            }
+        else:
+            # Use instruct model for simpler queries
+            request_options = {"max_tokens": 1500, "temperature": 0.3}
+        try:
+            response = self.client.chat.completions.create(
+                model=model,
+                messages=messages,
+                **request_options
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            st.error(f"Error with SEA-LION model: {str(e)}")
+            # Fallback to a simple response
+            return f"I apologize, but I encountered an error processing your query. Please try rephrasing your question. Error: {str(e)}"
+def classify_query_type(query: str) -> str:
+    """Public function to classify query type for UI display.
+
+    Returns "simple" for translation requests and for queries the keyword
+    heuristics do not rate as complex, otherwise "complex". This mirrors the
+    model choice made in SEALionLLM.generate_response.
+    """
+    # The two helpers only inspect the query text, so create the object
+    # without running __init__: no API client is built and no API key is
+    # needed just to label a query.
+    classifier = SEALionLLM.__new__(SEALionLLM)
+    if classifier._is_translation_query(query):
+        return "simple"
+    return "complex" if classifier._is_complex_query(query) else "simple"
+class DocumentIngestion:
+    def __init__(self):
+        # Use BGE embeddings by default for better performance
+        try:
+            self.embeddings = AlternativeEmbeddings()
+            self.embedding_type = "BGE-small-en"
+            if not self.embeddings.model:
+                raise Exception("BGE model not available")
+        except Exception:
+            # Fallback to OpenAI if BGE not available
+            openai_key = os.getenv("OPENAI_API_KEY")
+            if openai_key and openai_key != "placeholder_for_embeddings" and openai_key != "your_openai_api_key_here":
+                try:
+                    self.embeddings = OpenAIEmbeddings()
+                    self.embedding_type = "OpenAI"
+                except Exception as e:
+                    st.error("Both BGE and OpenAI embeddings failed. Please check your setup.")
+                    raise e
+            else:
+                st.error("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
+                raise Exception("No embedding model available")
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200,
+            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+        )
+        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
+        os.makedirs(self.persist_directory, exist_ok=True)
+    def extract_text_from_pdf(self, pdf_file) -> str:
+        """Extract text from uploaded PDF file with multiple fallback methods."""
+        try:
+            # Method 1: Try with PyPDF2 (handles most PDFs including encrypted ones with PyCryptodome)
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            # Check if PDF is encrypted
+            if pdf_reader.is_encrypted:
+                # Try to decrypt with empty password (common for protected but not password-protected PDFs)
+                try:
+                    pdf_reader.decrypt("")
+                except Exception:
+                    st.warning(f"PDF {pdf_file.name} is password-protected. Please provide an unprotected version.")
+                    return ""
+            text = ""
+            for page_num, page in enumerate(pdf_reader.pages):
+                try:
+                    page_text = page.extract_text()
+                    text += page_text + "\n"
+                except Exception as e:
+                    st.warning(f"Could not extract text from page {page_num + 1} of {pdf_file.name}: {str(e)}")
+                    continue
+            if text.strip():
+                return text
+            else:
+                st.warning(f"No extractable text found in {pdf_file.name}. This might be a scanned PDF or image-based document.")
+                return ""
+        except Exception as e:
+            error_msg = str(e)
+            if "PyCryptodome" in error_msg:
+                st.error(f"Encryption error with {pdf_file.name}: {error_msg}")
+                st.info("💡 The PDF uses encryption. PyCryptodome has been installed to handle this.")
+            elif "password" in error_msg.lower():
+                st.error(f"Password-protected PDF: {pdf_file.name}")
+                st.info("💡 Please provide an unprotected version of this PDF.")
+            else:
+                st.error(f"Error extracting text from {pdf_file.name}: {error_msg}")
+            return ""
+    def process_documents(self, uploaded_files, university_name: str = "",
+                         country: str = "", document_type: str = "admission_requirements") -> List[Document]:
+        """Process uploaded PDF files and convert to documents."""
+        documents = []
+        processed_count = 0
+        failed_count = 0
+        st.info(f"📄 Processing {len(uploaded_files)} document(s)...")
+        for uploaded_file in uploaded_files:
+            if uploaded_file.type == "application/pdf":
+                st.write(f"🔍 Extracting text from: **{uploaded_file.name}**")
+                # Extract text
+                text = self.extract_text_from_pdf(uploaded_file)
+                if text.strip():
+                    # Create metadata
+                    metadata = {
+                        "source": uploaded_file.name,
+                        "university": university_name,
+                        "country": country,
+                        "document_type": document_type,
+                        "upload_timestamp": datetime.now().isoformat(),
+                        "file_id": str(uuid.uuid4())
+                    }
+                    # Create document
+                    doc = Document(
+                        page_content=text,
+                        metadata=metadata
+                    )
+                    documents.append(doc)
+                    processed_count += 1
+                    st.success(f"✅ Successfully processed: **{uploaded_file.name}** ({len(text)} characters)")
+                else:
+                    failed_count += 1
+                    st.warning(f"⚠️ Could not extract text from **{uploaded_file.name}**")
+            else:
+                failed_count += 1
+                st.error(f"❌ Unsupported file type: **{uploaded_file.type}** for {uploaded_file.name}")
+        # Summary
+        if processed_count > 0:
+            st.success(f"🎉 Successfully processed **{processed_count}** document(s)")
+        if failed_count > 0:
+            st.warning(f"⚠️ Failed to process **{failed_count}** document(s)")
+        return documents
+    def create_vector_store(self, documents: List[Document]) -> Chroma:
+        """Create and persist vector store from documents."""
+        if not documents:
+            st.error("No documents to process")
+            return None
+        # Split documents into chunks
+        texts = self.text_splitter.split_documents(documents)
+        # Create vector store
+        vectorstore = Chroma.from_documents(
+            documents=texts,
+            embedding=self.embeddings,
+            persist_directory=self.persist_directory
+        )
+        return vectorstore
+    def load_existing_vectorstore(self) -> Optional[Chroma]:
+        """Load existing vector store if it exists."""
+        try:
+            vectorstore = Chroma(
+                persist_directory=self.persist_directory,
+                embedding_function=self.embeddings
+            )
+            return vectorstore
+        except Exception as e:
+            st.warning(f"Could not load existing vector store: {str(e)}")
+            return None
+class RAGSystem:
+    def __init__(self):
+        # Initialize embeddings - try BGE first, fallback to OpenAI
+        try:
+            self.embeddings = AlternativeEmbeddings()
+            if not self.embeddings.model:
+                # Fallback to OpenAI if BGE not available
+                self.embeddings = OpenAIEmbeddings()
+        except Exception:
+            # If both fail, use OpenAI as last resort
+            self.embeddings = OpenAIEmbeddings()
+        self.sea_lion_llm = SEALionLLM()
+        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
+    def get_vectorstore(self) -> Optional[Chroma]:
+        """Get the vector store."""
+        try:
+            vectorstore = Chroma(
+                persist_directory=self.persist_directory,
+                embedding_function=self.embeddings
+            )
+            return vectorstore
+        except Exception as e:
+            st.error(f"Error loading vector store: {str(e)}")
+            return None
+    def query(self, question: str, language: str = "English") -> Dict[str, Any]:
+        """Query the RAG system using SEA-LION models."""
+        vectorstore = self.get_vectorstore()
+        if not vectorstore:
+            return {
+                "answer": "No documents have been ingested yet. Please upload some PDF documents first.",
+                "source_documents": [],
+                "query_id": None
+            }
+        try:
+            # Retrieve relevant documents
+            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+            relevant_docs = retriever.get_relevant_documents(question)
+            # Prepare context from retrieved documents
+            context_parts = []
+            for i, doc in enumerate(relevant_docs, 1):
+                source_info = doc.metadata.get('source', 'Unknown')
+                university = doc.metadata.get('university', 'Unknown')
+                country = doc.metadata.get('country', 'Unknown')
+                context_parts.append(f"""
+Document {i} (Source: {source_info}, University: {university}, Country: {country}):
+{doc.page_content[:500]}...
+""")
+            context = "\n".join(context_parts)
+            # Generate response using SEA-LION model
+            answer = self.sea_lion_llm.generate_response(
+                query=question,
+                context=context,
+                language=language
+            )
+            # Generate query ID for sharing
+            query_id = str(uuid.uuid4())
+            return {
+                "answer": answer,
+                "source_documents": relevant_docs,
+                "query_id": query_id,
+                "original_question": question,
+                "language": language,
+                "model_used": "SEA-LION" + (" Reasoning" if self.sea_lion_llm._is_complex_query(question) else " Instruct")
+            }
+        except Exception as e:
+            st.error(f"Error querying system: {str(e)}")
+            return {
+                "answer": f"Error processing your question: {str(e)}",
+                "source_documents": [],
+                "query_id": None
+            }
+def save_query_result(query_result: Dict[str, Any]):
+    """Save query result for sharing."""
+    if query_result.get("query_id"):
+        results_dir = "query_results"
+        os.makedirs(results_dir, exist_ok=True)
+        result_file = f"{results_dir}/{query_result['query_id']}.json"
+        # Prepare data for saving (remove non-serializable objects)
+        save_data = {
+            "query_id": query_result["query_id"],
+            "question": query_result.get("original_question", ""),
+            "answer": query_result["answer"],
+            "language": query_result.get("language", "English"),
+            "timestamp": datetime.now().isoformat(),
+            "sources": [
+                {
+                    "source": doc.metadata.get("source", "Unknown"),
+                    "university": doc.metadata.get("university", "Unknown"),
+                    "country": doc.metadata.get("country", "Unknown"),
+                    "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
+                }
+                for doc in query_result.get("source_documents", [])
+            ]
+        }
+        try:
+            with open(result_file, 'w', encoding='utf-8') as f:
+                json.dump(save_data, f, indent=2, ensure_ascii=False)
+            return True
+        except Exception as e:
+            st.error(f"Error saving query result: {str(e)}")
+            return False
+    return False
+def load_shared_query(query_id: str) -> Optional[Dict[str, Any]]:
+    """Load a shared query result."""
+    result_file = f"query_results/{query_id}.json"
+    if os.path.exists(result_file):
+        try:
+            with open(result_file, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except Exception as e:
+            st.error(f"Error loading shared query: {str(e)}")
+    return None

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+langchain==0.1.0
+langchain-community==0.0.10
+langchain-openai==0.0.5
+streamlit==1.29.0
+pypdf2==3.0.1
+pycryptodome==3.23.0
+chromadb==0.4.22
+sentence-transformers==5.1.0
+faiss-cpu==1.7.4
+python-dotenv==1.0.0
+openai==1.6.1
+tiktoken==0.5.2
+streamlit-extras==0.3.5
+watchdog==3.0.0

requirements_clean.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+langchain
+langchain-community
+langchain-openai
+streamlit
+pypdf2
+chromadb
+sentence-transformers
+python-dotenv
+openai
+tiktoken
+streamlit-extras

sample_documents/sample_university_requirements.txt ADDED Viewed

	@@ -0,0 +1,177 @@

+# Sample University Admission Requirements
+## National University of Singapore (NUS) - Computer Science Master's Program
+### Program Overview
+The Master of Computing (Computer Science) program at NUS is a comprehensive graduate program designed for students seeking advanced knowledge in computer science.
+### Admission Requirements
+#### Academic Requirements
+- Bachelor's degree in Computer Science, Computer Engineering, or related field
+- Minimum GPA of 3.5/4.0 or equivalent (Second Class Upper Honours)
+- Strong background in mathematics and programming
+#### English Proficiency Requirements
+For international students whose native language is not English:
+- IELTS: Minimum overall score of 6.5 (no band less than 6.0)
+- TOEFL iBT: Minimum score of 85 (writing minimum 22)
+- PTE Academic: Minimum score of 65
+#### Additional Requirements
+- Statement of Purpose (500-1000 words)
+- Two letters of recommendation from academic or professional referees
+- Resume/CV highlighting relevant experience
+- Portfolio of programming projects (preferred)
+### Tuition Fees (2024-2025 Academic Year)
+- Singapore Citizens: S$12,500 per year
+- Singapore Permanent Residents: S$17,500 per year
+- International Students: S$25,000 per year
+### Application Deadlines
+- **Priority Round**: November 15, 2024
+- **Regular Round**: January 31, 2025
+- **Late Round**: March 15, 2025 (subject to availability)
+### Application Process
+1. Submit online application through NUS Graduate School portal
+2. Upload required documents
+3. Pay application fee of S$50
+4. Submit by deadline
+5. Attend interview if shortlisted (February-April)
+6. Admission results: April-May
+### Program Duration
+- Full-time: 1.5 years (3 semesters)
+- Part-time: 2.5 years (5 semesters)
+### Financial Aid
+- NUS Graduate Research Scholarship available for qualifying students
+- Teaching assistantships for outstanding applicants
+- Industry sponsorship opportunities
+### Contact Information
+- Email: [email protected]
+- Phone: +65 6516 2492
+- Website: www.nus.edu.sg/graduateschool
+---
+## University of Malaya (UM) - Engineering Master's Programs
+### Program Overview
+The Faculty of Engineering offers various Master's degree programs in engineering disciplines.
+### Admission Requirements
+#### Academic Requirements
+- Bachelor's degree in Engineering or related field with minimum CGPA of 3.0/4.0
+- For applicants with CGPA below 3.0, relevant work experience of at least 2 years required
+#### English Proficiency Requirements
+For international students:
+- IELTS: Minimum overall score of 6.0 (no band less than 5.5)
+- TOEFL iBT: Minimum score of 80
+- MUET (Malaysian University English Test): Band 4 minimum
+#### Program-Specific Requirements
+- **Civil Engineering**: AutoCAD proficiency preferred
+- **Electrical Engineering**: Basic knowledge of circuit analysis
+- **Mechanical Engineering**: Thermodynamics and fluid mechanics background
+### Tuition Fees (2024 Academic Year)
+- Malaysian Citizens: RM 8,000 per year
+- International Students: RM 15,000 per year
+- ASEAN Students: RM 12,000 per year (special rate)
+### Application Deadlines
+- **Main Intake (September)**: April 30, 2024
+- **Second Intake (February)**: October 31, 2024
+### Scholarships Available
+- UM Graduate Merit Scholarship (50% tuition fee waiver)
+- ASEAN Scholarship Program
+- Industry-sponsored scholarships
+### Living Costs (Estimated per month)
+- Accommodation: RM 500-800
+- Food: RM 400-600
+- Transportation: RM 100-200
+- Other expenses: RM 200-300
+- **Total: RM 1,200-1,900 per month**
+### Application Requirements
+1. Completed application form
+2. Academic transcripts
+3. Bachelor's degree certificate
+4. English proficiency test results
+5. Two reference letters
+6. Research proposal (for research-based programs)
+7. Passport copy
+8. Passport-sized photographs
+### Contact Information
+- Email: [email protected]
+- Phone: +603 7967 3026
+- Address: Faculty of Engineering, University of Malaya, 50603 Kuala Lumpur, Malaysia
+---
+## Chulalongkorn University - Business Administration Master's (MBA)
+### Program Overview
+The Chulalongkorn Business School MBA program is Thailand's premier business education program.
+### Admission Requirements
+#### Academic Requirements
+- Bachelor's degree from accredited institution
+- Minimum GPA of 2.75/4.0 or equivalent
+- GMAT score of 500+ (preferred) or GRE equivalent
+- Minimum 2 years of work experience
+#### English Proficiency Requirements
+- TOEFL iBT: Minimum score of 79
+- IELTS: Minimum overall score of 6.5
+- CU-TEP: Minimum score of 80
+### Tuition Fees (2024-2025)
+- Full-time MBA: 850,000 THB (approx. USD 25,000) total program
+- Executive MBA: 1,200,000 THB (approx. USD 35,000) total program
+### Application Deadlines
+- **Early Admission**: February 15, 2024
+- **Regular Admission**: April 30, 2024
+- **Final Round**: June 15, 2024
+### Program Duration
+- Full-time MBA: 16 months
+- Executive MBA: 18 months (weekend classes)
+### Scholarships
+- Merit-based scholarships up to 50% tuition
+- Corporate sponsorship opportunities
+- Government scholarships for ASEAN students
+### Application Process
+1. Online application submission
+2. Submit required documents
+3. GMAT/GRE scores
+4. Personal interview
+5. Group discussion assessment
+### Career Support
+- Career counseling services
+- Industry networking events
+- Internship placement assistance
+- Alumni network access
+### Contact Information
+- Email: [email protected]
+- Phone: +66 2 218 6601
+- Website: www.cbs.chula.ac.th
+---
+*This document contains sample admission information for demonstration purposes. Please verify all details with the respective universities before applying.*

start.sh ADDED Viewed

	@@ -0,0 +1,43 @@

+#!/bin/bash
+# PanSea University Search - Startup Script
+# Activates .venv, checks for .env, creates the working directories,
+# installs missing dependencies and launches the Streamlit app on port 8501.
+echo "🎓 Starting PanSea University Search..."
+# Check if virtual environment exists
+if [ ! -d ".venv" ]; then
+    echo "❌ Virtual environment not found. Please run setup first."
+    exit 1
+fi
+# Activate virtual environment
+source .venv/bin/activate
+# Check if .env file exists (both the SEA-LION and the OpenAI key live there)
+if [ ! -f ".env" ]; then
+    echo "⚠️  .env file not found. Please create one with your API keys."
+    echo "Example:"
+    echo "SEA_LION_API_KEY=your_sea_lion_api_key_here"
+    echo "OPENAI_API_KEY=your_openai_api_key_here"
+    exit 1
+fi
+# Create necessary directories
+mkdir -p chroma_db documents query_results
+# Check if required packages are installed
+echo "🔍 Checking dependencies..."
+if ! python -c "import streamlit, langchain, chromadb" 2>/dev/null; then
+    echo "❌ Dependencies not found. Installing..."
+    # Stop here if installation fails; the app cannot start without them.
+    if ! pip install -r requirements.txt; then
+        echo "❌ Failed to install dependencies."
+        exit 1
+    fi
+fi
+echo "🚀 Starting Streamlit application..."
+echo "📱 Open your browser to: http://localhost:8501"
+echo "🛑 Press Ctrl+C to stop the application"
+echo ""
+# Start the Streamlit app
+streamlit run app.py --server.port=8501 --server.address=0.0.0.0

test_system.py ADDED Viewed

	@@ -0,0 +1,192 @@

+#!/usr/bin/env python3
+"""
+Test script for PanSea University Search System
+"""
+import os
+import sys
+from pathlib import Path
+# Add the project directory to Python path so the rag_system module imported
+# by the tests below can be found regardless of the current working directory
+project_dir = Path(__file__).parent
+sys.path.insert(0, str(project_dir))
+def test_imports():
+    """Test if all required modules can be imported."""
+    print("🧪 Testing imports...")
+    try:
+        import streamlit
+        print("✅ Streamlit imported successfully")
+    except ImportError as e:
+        print(f"❌ Failed to import Streamlit: {e}")
+        return False
+    try:
+        import langchain
+        print("✅ LangChain imported successfully")
+    except ImportError as e:
+        print(f"❌ Failed to import LangChain: {e}")
+        return False
+    try:
+        import chromadb
+        print("✅ ChromaDB imported successfully")
+    except ImportError as e:
+        print(f"❌ Failed to import ChromaDB: {e}")
+        return False
+    try:
+        from rag_system import DocumentIngestion, RAGSystem
+        print("✅ RAG system modules imported successfully")
+    except ImportError as e:
+        print(f"❌ Failed to import RAG system: {e}")
+        return False
+    return True
+def test_environment():
+    """Test environment configuration."""
+    print("\n🔧 Testing environment...")
+    # Check if .env file exists
+    env_file = project_dir / ".env"
+    if not env_file.exists():
+        print("⚠️  .env file not found. You'll need to create one with your OpenAI API key.")
+        return False
+    # Load environment variables
+    try:
+        from dotenv import load_dotenv
+        load_dotenv()
+        # Check SEA-LION API key
+        sea_lion_key = os.getenv("SEA_LION_API_KEY")
+        if not sea_lion_key:
+            print("⚠️  SEA_LION_API_KEY not found in .env file")
+            return False
+        elif sea_lion_key == "your_sea_lion_api_key_here":
+            print("⚠️  Please update SEA_LION_API_KEY in .env file with your actual API key")
+            return False
+        else:
+            print(f"✅ SEA-LION API key configured (length: {len(sea_lion_key)})")
+        # Check OpenAI API key (for embeddings)
+        openai_key = os.getenv("OPENAI_API_KEY")
+        if not openai_key or openai_key == "your_openai_api_key_here":
+            print("⚠️  OpenAI API key not properly configured. This is needed for document embeddings.")
+            return False
+        else:
+            print(f"✅ OpenAI API key configured (length: {len(openai_key)})")
+        return True
+    except ImportError as e:
+        print(f"❌ Failed to load environment: {e}")
+        return False
+def test_directories():
+    """Test if required directories exist or can be created."""
+    print("\n📁 Testing directories...")
+    required_dirs = ["chroma_db", "documents", "query_results"]
+    for dir_name in required_dirs:
+        dir_path = project_dir / dir_name
+        try:
+            dir_path.mkdir(exist_ok=True)
+            print(f"✅ Directory '{dir_name}' ready")
+        except Exception as e:
+            print(f"❌ Failed to create directory '{dir_name}': {e}")
+            return False
+    return True
+def test_basic_functionality():
+    """Test basic RAG system functionality."""
+    print("\n⚡ Testing basic functionality...")
+    try:
+        from rag_system import DocumentIngestion, SEALionLLM
+        # Test document ingestion initialization
+        doc_ingestion = DocumentIngestion()
+        print("✅ DocumentIngestion initialized successfully")
+        # Test SEA-LION LLM
+        sea_lion = SEALionLLM()
+        print("✅ SEALionLLM initialized successfully")
+        # Test query classification
+        complex_query = "Show me universities in Malaysia for master's degree under 40000 RMB"
+        simple_query = "What does IELTS stand for?"
+        is_complex1 = sea_lion._is_complex_query(complex_query)
+        is_complex2 = sea_lion._is_complex_query(simple_query)
+        if is_complex1 and not is_complex2:
+            print("✅ Query classification working correctly")
+        else:
+            print(f"⚠️  Query classification may need adjustment (complex: {is_complex1}, simple: {is_complex2})")
+        # Test text splitter
+        text = "This is a test document. It contains multiple sentences. Each sentence should be processed correctly."
+        docs = doc_ingestion.text_splitter.split_text(text)
+        if docs:
+            print(f"✅ Text splitter working (created {len(docs)} chunks)")
+        else:
+            print("❌ Text splitter not working")
+            return False
+        return True
+    except Exception as e:
+        print(f"❌ Basic functionality test failed: {e}")
+        return False
+def main():
+    """Run all tests."""
+    print("🎓 PanSea University Search - System Test")
+    print("=" * 50)
+    tests = [
+        ("Import Test", test_imports),
+        ("Environment Test", test_environment),
+        ("Directory Test", test_directories),
+        ("Functionality Test", test_basic_functionality)
+    ]
+    all_passed = True
+    for test_name, test_func in tests:
+        try:
+            result = test_func()
+            if not result:
+                all_passed = False
+        except Exception as e:
+            print(f"❌ {test_name} failed with exception: {e}")
+            all_passed = False
+    print("\n" + "=" * 50)
+    if all_passed:
+        print("✅ All tests passed! Your system is ready to use.")
+        print("\nTo start the application, run:")
+        print("  ./start.sh")
+        print("\nOr manually with:")
+        print("  streamlit run app.py")
+    else:
+        print("❌ Some tests failed. Please fix the issues above before running the application.")
+        print("\nCommon solutions:")
+        print("1. Make sure you've activated the virtual environment:")
+        print("   source .venv/bin/activate")
+        print("2. Install dependencies:")
+        print("   pip install -r requirements.txt")
+        print("3. Create .env file with your API keys:")
+        print("   SEA_LION_API_KEY=your_sea_lion_api_key")
+        print("   OPENAI_API_KEY=your_openai_api_key")
+    return all_passed
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)