Spaces:

Yozora721
/

pnp-chatbot-v1

Sleeping

App Files Files Community

FauziIsyrinApridal commited on May 27

Commit

ea1ba01

1 Parent(s): 69f9e8f

Initial commit without binary files

Browse files

Files changed (17) hide show

.env.example +3 -0
.gitattributes +0 -35
.gitignore +6 -0
README.md +60 -14
app.py +85 -0
app/__init__.py +1 -0
app/chat.py +101 -0
app/config.py +9 -0
app/data_loader.py +108 -0
app/db.py +4 -0
app/document_processor.py +44 -0
app/prompts.py +24 -0
rag_eval.ipynb +0 -0
requirements.txt +117 -0
tests/__init__.py +0 -0
tests/test_config.py +7 -0
tests/test_db.py +5 -0

.env.example ADDED Viewed

	@@ -0,0 +1,3 @@

+REPLICATE_API_TOKEN=
+SUPABASE_URL=
+SUPABASE_KEY=

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+/venv
+.env
+/app/__pycache__
+/tests/__pycache__
+/app/scrapping/*.py
+/vector_store_data

README.md CHANGED Viewed

@@ -1,14 +1,60 @@
----
-title: Pnp Chatbot V1
-emoji: 👀
-colorFrom: yellow
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.45.0
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Chatbot for Politeknik Negeri Padang
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Pnp-Bot
+Pnp-Bot is a web-based chatbot application that answers questions about Politeknik Negeri Padang using an LLM (Sahabat-AI) with the Retrieval-Augmented Generation (RAG) method.
+(Deployed):
+- Chatbot App :
+- Admin Dashboard : <span style="color:yellow;">Limited Access</span>, contact me for more details.
+### Chatbot app
+#### Technologies
+- Python
+- Langchain
+- Streamlit
+- Sahabat-AI (LLM)
+#### Create virtual environment
+```bash
+  python -m venv venv
+```
+#### Activate virtual environment
+On windows
+```bash
+  .\venv\Scripts\activate
+```
+On linux
+```bash
+  source venv/bin/activate
+```
+#### Install dependencies
+```bash
+  pip install -r requirements.txt
+```
+#### Setup env
+Create a .env file from the .env.example file
+```bash
+  cp .env.example .env
+```
+Fill in the variables with your own API keys from https://replicate.com/ and https://huggingface.co/, and with your Supabase project URL and key
+#### Running
+```bash
+  streamlit run app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import streamlit as st
+import os
+from dotenv import load_dotenv
+from langsmith import traceable
+from app.chat import initialize_session_state, display_chat_history
+from app.data_loader import get_data, load_docs
+from app.document_processor import process_documents, save_vector_store, load_vector_store
+from app.prompts import sahabat_prompt
+from langchain_community.llms import Replicate
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain_community.document_transformers import LongContextReorder
+# Load variables from a local .env file into the process environment.
+load_dotenv()
+# Folder checked for a previously saved vector store (must match
+# app.document_processor.VECTOR_STORE_PATH).
+VECTOR_STORE_PATH = "vector_store_data"
+# app.data_loader syncs the source documents into <repo>/app/data, so the
+# freshness check has to look there rather than at "data" relative to the
+# current working directory.
+DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "app", "data")
+@traceable(name="Create RAG Conversational Chain")
+def create_conversational_chain(vector_store):
+    """Build the conversational retrieval chain used to answer questions.
+
+    Args:
+        vector_store: Vector store whose ``as_retriever`` supplies the
+            context documents (``k`` is set to 6).
+
+    Returns:
+        A ``ConversationalRetrievalChain`` wired to the Replicate-hosted
+        model, a fresh conversation memory and ``sahabat_prompt``.
+    """
+    generation_settings = {"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
+    sahabat_llm = Replicate(
+        model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
+        model_kwargs=generation_settings,
+    )
+    # The chain is asked to return source documents as well as the answer;
+    # output_key names the output the memory should store.
+    conversation_memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        return_messages=True,
+        output_key='answer',
+    )
+    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": 6})
+    return ConversationalRetrievalChain.from_llm(
+        sahabat_llm,
+        retriever=top_k_retriever,
+        combine_docs_chain_kwargs={"prompt": sahabat_prompt},
+        return_source_documents=True,
+        memory=conversation_memory,
+    )
+def reorder_embedding(docs):
+    """Return ``docs`` rearranged by LangChain's ``LongContextReorder``."""
+    return LongContextReorder().transform_documents(docs)
+def get_latest_data_timestamp(folder):
+    """Return the newest modification time of any file under ``folder``.
+
+    The tree is walked recursively. ``0`` is returned when no files are
+    found (including when ``folder`` does not exist).
+    """
+    modified_times = (
+        os.path.getmtime(os.path.join(current_dir, filename))
+        for current_dir, _, filenames in os.walk(folder)
+        for filename in filenames
+    )
+    # The outer max keeps 0 as the floor, exactly like the running maximum.
+    return max(0, max(modified_times, default=0))
+def vector_store_is_outdated():
+    """Report whether the saved vector store must be rebuilt.
+
+    Returns:
+        ``True`` when nothing exists at ``VECTOR_STORE_PATH`` or when any
+        file under ``DATA_DIR`` is newer than the saved store.
+    """
+    if not os.path.exists(VECTOR_STORE_PATH):
+        return True
+    # A directory's own mtime changes only when entries are added or removed,
+    # not when existing index files are overwritten in place, so the files
+    # inside the store are consulted as well.
+    vector_store_time = max(
+        os.path.getmtime(VECTOR_STORE_PATH),
+        get_latest_data_timestamp(VECTOR_STORE_PATH),
+    )
+    data_time = get_latest_data_timestamp(DATA_DIR)
+    return data_time > vector_store_time
+@traceable(name="Main Chatbot RAG App")
+def main():
+    """Run one pass of the Streamlit chatbot script.
+
+    Initializes session state, syncs the source documents, makes sure a
+    vector store and a conversational chain exist for this session, then
+    renders the chat UI.
+    """
+    initialize_session_state()
+    get_data()
+    # Streamlit re-executes this script on every interaction. Build or load
+    # the vector store only while the session has none; keying this on an
+    # empty history repeated the work on every rerun before the first question.
+    if st.session_state['vector_store'] is None:
+        if vector_store_is_outdated():
+            docs = load_docs()
+            reordered_docs = reorder_embedding(docs)
+            vector_store = process_documents(reordered_docs)
+            save_vector_store(vector_store)
+        else:
+            vector_store = load_vector_store()
+        st.session_state['vector_store'] = vector_store
+    if st.session_state['vector_store'] is not None:
+        # Keep one chain per session: a chain rebuilt on each rerun starts
+        # with an empty ConversationBufferMemory.
+        if st.session_state.get('chain') is None:
+            st.session_state['chain'] = create_conversational_chain(st.session_state['vector_store'])
+        display_chat_history(st.session_state['chain'])
+if __name__ == "__main__":
+    main()

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # app/__init__.py

app/chat.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import streamlit as st
+from streamlit_chat import message
+from streamlit_mic_recorder import speech_to_text
+import base64
+import gtts
+from io import BytesIO
+def initialize_session_state():
+    """Seed ``st.session_state`` with defaults for keys that are not set yet.
+
+    Existing values are left untouched, so calling this on every Streamlit
+    rerun does not reset the session.
+    """
+    defaults = {
+        'history': [],
+        'generated': ["Halo! Saya bisa membantu anda menjawab pertanyaan seputar Politeknik Negeri Padang!"],
+        'past': ["Hai! 👋"],
+        'data_len': 0,
+        'vector_store': None,
+        'should_speak': True,
+        'input_text': "",
+    }
+    for key, value in defaults.items():
+        if key not in st.session_state:
+            st.session_state[key] = value
+def text_to_speech(text):
+    """Synthesize ``text`` as Indonesian speech and return an HTML audio tag.
+
+    The audio produced by gTTS is embedded as a base64 ``audio/mp3`` data
+    URI inside an ``<audio autoplay>`` element.
+    """
+    speech = gtts.gTTS(text, lang="id")
+    mp3_buffer = BytesIO()
+    speech.write_to_fp(mp3_buffer)
+    # getvalue() returns the whole buffer regardless of the current position.
+    audio_base64 = base64.b64encode(mp3_buffer.getvalue()).decode()
+    return f"""
+    <audio autoplay>
+        <source src="data:audio/mp3;base64,{audio_base64}" type="audio/mp3">
+    </audio>
+    """
+def conversation_chat(query, chain, history):
+    """Ask ``chain`` a question and record the exchange.
+
+    Args:
+        query: The user's question.
+        chain: Callable taking a dict with ``"question"`` and
+            ``"chat_history"`` and returning a mapping with an ``"answer"``.
+        history: List of ``(question, answer)`` tuples; appended to in place.
+
+    Returns:
+        The answer produced by the chain.
+    """
+    payload = {"question": query, "chat_history": history}
+    answer = chain(payload)["answer"]
+    history.append((query, answer))
+    return answer
+def display_chat_history(chain):
+    """Render the chat UI and answer any newly submitted question.
+
+    Input can come from the text box or from the microphone
+    (speech-to-text). Answers are appended to the session's chat log and,
+    when text-to-speech is enabled, played back as audio.
+
+    Args:
+        chain: Conversational chain passed on to ``conversation_chat``.
+    """
+    # Created before the input widgets; the chat log is written into it below.
+    reply_container = st.container()
+    # Input row: text box, text-to-speech toggle, microphone.
+    col1, col2, col3 = st.columns([7, 1, 1])
+    with col2:
+        # Text-to-speech toggle rendered as an icon button.
+        should_speak = st.session_state.get('should_speak', True)
+        icon_label = "🔊" if should_speak else "🔇"
+        if st.button(icon_label, key="toggle_tts", help="Aktifkan/Nonaktifkan Text-to-Speech", use_container_width=True):
+            st.session_state['should_speak'] = not should_speak
+            # The icon above was drawn from the old state; rerun so it
+            # reflects the new setting immediately.
+            st.rerun()
+    with col3:
+        # Microphone input with the language set to Indonesian.
+        stt_text = speech_to_text(
+            start_prompt="🎤",
+            stop_prompt="🛑 Stop",
+            language='id',
+            just_once=True,
+            key='stt_input',
+            use_container_width=True,
+        )
+    with col1:
+        user_input_obj = st.chat_input(
+            "Masukkan pertanyaan atau Tekan tombol mic untuk berbicara!",
+            key="chat_input_field"
+        )
+    # A speech result is stored in session state and the script is rerun, so
+    # it is picked up as pending input on the next pass.
+    if stt_text:
+        st.session_state.input_text = stt_text
+        st.rerun()
+    user_input = user_input_obj or st.session_state.get("input_text", "")
+    if user_input:
+        with st.spinner('Sedang membuat jawaban...'):
+            output = conversation_chat(user_input, chain, st.session_state['history'])
+        st.session_state['past'].append(user_input)
+        st.session_state['generated'].append(output)
+        st.session_state.input_text = ""  # clear the pending input once it has been sent
+        if st.session_state['should_speak'] and output:
+            st.markdown(text_to_speech(output), unsafe_allow_html=True)
+    # Render the chat log.
+    if st.session_state['generated']:
+        with reply_container:
+            exchanges = zip(st.session_state['past'], st.session_state['generated'])
+            for i, (user_text, bot_text) in enumerate(exchanges):
+                message(user_text, is_user=True, key=str(i) + '_user', avatar_style="no-avatar")
+                message(bot_text, key=str(i), avatar_style="no-avatar")

app/config.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+from dotenv import load_dotenv
+# Load a local .env file (if present) before the settings below are read.
+load_dotenv()
+class Config:
+    """Settings read from environment variables when this module is imported.
+
+    Each attribute is ``None`` when its variable is not set.
+    """
+    SUPABASE_URL = os.environ.get('SUPABASE_URL')
+    SUPABASE_KEY = os.environ.get('SUPABASE_KEY')
+    REPLICATE_API_TOKEN = os.environ.get('REPLICATE_API_TOKEN')

app/data_loader.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+from app.db import supabase
+from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
+def list_all_files(bucket_name, limit_per_page=1000):
+    """List every entry at the root of a Supabase storage bucket.
+
+    Results are fetched page by page until a page comes back empty.
+
+    Args:
+        bucket_name: Name of the storage bucket to list.
+        limit_per_page: Number of entries requested per call.
+
+    Returns:
+        A list with the entries returned by the storage client.
+
+    Raises:
+        Exception: Whatever the storage client raises. The error is logged
+            and re-raised instead of being swallowed, because a truncated
+            listing would make ``get_data`` delete local files that still
+            exist in the bucket.
+    """
+    all_files = []
+    offset = 0
+    while True:
+        try:
+            files = supabase.storage.from_(bucket_name).list("", {
+                "limit": limit_per_page,
+                "offset": offset
+            })
+        except Exception as e:
+            print(f"Error fetching files with offset {offset}: {e}")
+            raise
+        if not files:
+            break
+        all_files.extend(files)
+        offset += limit_per_page
+    return all_files
+def get_data():
+    """Mirror the ``pnp-bot-storage`` bucket into the ``data`` folder next to this module.
+
+    Local files that are absent from the bucket are deleted, and bucket
+    files that are missing locally are downloaded. Failures are printed; the
+    function returns early if either listing fails.
+    """
+    bucket = "pnp-bot-storage"
+    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    try:
+        local_names = {
+            entry for entry in os.listdir(data_dir)
+            if os.path.isfile(os.path.join(data_dir, entry))
+        }
+    except Exception as e:
+        print(f"Error accessing local files: {e}")
+        return
+    try:
+        remote_entries = list_all_files(bucket)
+    except Exception as e:
+        print(f"Error fetching files from Supabase: {e}")
+        return
+    remote_names = {entry["name"] for entry in remote_entries}
+    # Synchronise: remove local files that no longer exist in storage.
+    for filename in local_names - remote_names:
+        try:
+            os.remove(os.path.join(data_dir, filename))
+            print(f"Removed: {filename}")
+        except Exception as e:
+            print(f"Error removing {filename}: {e}")
+    # Then fetch whatever the bucket has that is missing locally.
+    for filename in remote_names - local_names:
+        try:
+            file_path = os.path.join(data_dir, filename)
+            res = supabase.storage.from_(bucket).download(filename)
+            with open(file_path, "wb") as f:
+                f.write(res)
+            print(f"Downloaded: {filename}")
+        except Exception as e:
+            print(f"Error downloading {filename}: {e}")
+def load_docs():
+    """Load every supported document from the ``data`` folder next to this module.
+
+    Supported extensions are ``.pdf``, ``.docx``/``.doc`` and ``.txt``. The
+    extension check is case-insensitive, so a file such as ``REPORT.PDF`` is
+    not silently skipped. Files are visited in sorted order so the result
+    does not depend on directory listing order. A file that fails to load is
+    reported and skipped.
+
+    Returns:
+        A list of loaded documents; an empty list when the folder was
+        missing (it is created) or cannot be read.
+    """
+    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(BASE_DIR, 'data')
+    if not os.path.exists(data_dir):
+        print(f"Directory not found: {data_dir}")
+        os.makedirs(data_dir)
+        print(f"Created directory: {data_dir}")
+        return []
+    documents = []
+    try:
+        files = sorted(os.listdir(data_dir))
+    except PermissionError:
+        print(f"Permission denied: {data_dir}")
+        return []
+    for file in files:
+        file_path = os.path.join(data_dir, file)
+        # Compare against a lower-cased name; the original name is kept for
+        # the path and the messages.
+        lowered = file.lower()
+        if lowered.endswith(".pdf"):
+            try:
+                loader = PyPDFLoader(file_path)
+                documents.extend(loader.load())
+            except Exception as e:
+                print(f"Error loading PDF file {file}: {e}")
+        elif lowered.endswith(('.docx', '.doc')):
+            try:
+                loader = Docx2txtLoader(file_path)
+                documents.extend(loader.load())
+            except Exception as e:
+                print(f"Error loading DOCX/DOC file {file}: {e}")
+        elif lowered.endswith('.txt'):
+            try:
+                loader = TextLoader(file_path, encoding='utf-8', autodetect_encoding=True)
+                documents.extend(loader.load())
+            except Exception as e:
+                print(f"Error loading TXT file {file}: {e}")
+    return documents

app/db.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from supabase import create_client
+from app.config import Config
+# Shared Supabase client, created once at import time from the URL and key held in Config.
+supabase = create_client(Config.SUPABASE_URL, Config.SUPABASE_KEY)

app/document_processor.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+import os
+# Folder the vector store is saved to and loaded from.
+VECTOR_STORE_PATH = "vector_store_data"
+def save_vector_store(vector_store):
+    """Save the vector store to ``VECTOR_STORE_PATH`` and print the location."""
+    vector_store.save_local(VECTOR_STORE_PATH)
+    print(f"Vector store saved to {VECTOR_STORE_PATH}")
+def _build_embeddings():
+    """Create the embedding model shared by indexing and loading.
+
+    Both paths go through this helper so a saved index is always loaded
+    with the same model and settings it was built with.
+    """
+    return HuggingFaceEmbeddings(
+        model_name="LazarusNLP/all-indo-e5-small-v4",
+        model_kwargs={"device": "cpu"},
+        encode_kwargs={"normalize_embeddings": True}
+    )
+def load_vector_store():
+    """Load the vector store from disk, or return None when it does not exist."""
+    if not os.path.exists(VECTOR_STORE_PATH):
+        print("Vector store file not found.")
+        return None
+    # NOTE(security): allow_dangerous_deserialization=True permits pickle
+    # loading of the stored data. Only load index folders this application
+    # wrote itself.
+    vector_store = FAISS.load_local(VECTOR_STORE_PATH, _build_embeddings(), allow_dangerous_deserialization=True)
+    print(f"Vector store loaded from {VECTOR_STORE_PATH}")
+    return vector_store
+def process_documents(docs):
+    """Split ``docs`` into overlapping chunks and index them with FAISS.
+
+    Args:
+        docs: Documents to index.
+
+    Returns:
+        A FAISS vector store built from the chunks (chunk size 1500,
+        overlap 300).
+
+    Raises:
+        ValueError: If ``docs`` produces no chunks. Raised before the
+            embedding model is created, instead of failing inside FAISS.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1500,
+        chunk_overlap=300
+    )
+    text_chunks = text_splitter.split_documents(docs)
+    if not text_chunks:
+        raise ValueError("No document chunks to index: no documents were loaded.")
+    vector_store = FAISS.from_documents(text_chunks, _build_embeddings())
+    return vector_store

app/prompts.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from langchain.prompts import PromptTemplate
+# Chat-formatted prompt text: a system message with the assistant's rules
+# (written in Indonesian), a user message carrying the retrieved context and
+# the question, and an open assistant header for the model to continue.
+# NOTE(review): the <|...|> markers look like Llama-3-style chat tokens —
+# confirm they match the deployed model's template.
+prompt = """<|begin_of_text|>
+<|start_header_id|>system<|end_header_id|>
+Kamu adalah asisten dari Politeknik Negeri Padang.
+Tugasmu adalah menjawab pertanyaan berdasarkan konteks dokumen yang diberikan oleh pengguna.
+Jika pengguna bertanya di luar topik dokumen, jangan tanggapi.
+Jika konteks yang diberikan tidak cukup untuk menjawab pertanyaan, katakan bahwa kamu tidak memiliki jawabannya.
+Jawablah menggunakan bahasa yang sama dengan yang digunakan pengguna seperti Bahasa Indonesia, Bahasa Jawa, Bahasa Minang, Bahasa Sunda, atau Bahasa Inggris.
+Berikan jawaban jelas dan terstruktur
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+Jawablah pertanyaan pengguna berdasarkan konteks berikut:
+Konteks: {context}
+Pertanyaan: {question}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+"""
+# Template with two placeholders, {context} and {question}.
+sahabat_prompt = PromptTemplate(
+    template=prompt,
+    input_variables=["context", "question"]
+)

rag_eval.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,117 @@

+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==4.0.0
+anyio==3.7.1
+attrs==23.2.0
+beautifulsoup4==4.12.3
+blinker==1.7.0
+bs4==0.0.2
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+dataclasses-json==0.5.14
+distro==1.9.0
+entrypoints==0.4
+faiss-cpu==1.7.4
+filelock==3.13.4
+frozenlist==1.4.1
+fsspec==2024.3.1
+gitdb==4.0.11
+GitPython==3.1.43
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+huggingface-hub==0.14.1
+idna==3.7
+importlib_metadata==7.1.0
+InstructorEmbedding==1.0.1
+Jinja2==3.1.3
+joblib==1.4.0
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+langchain==0.1.17
+langchain-community
+langchain-core==0.1.52
+langchain-text-splitters==0.0.1
+langsmith==0.1.55
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+networkx==3.3
+nltk==3.8.1
+numexpr==2.10.0
+numpy==1.26.4
+openai==1.26.0
+openapi-schema-pydantic==1.2.4
+orjson==3.10.3
+packaging==23.2
+pandas==2.2.2
+pillow==10.3.0
+protobuf==3.20.3
+pyarrow==16.0.0
+pydantic==1.10.15
+pydeck==0.8.1b0
+Pygments==2.17.2
+Pympler==1.0.1
+pypdf==4.2.0
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.0
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.34.0
+regex==2024.4.16
+replicate==0.25.2
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+safetensors==0.4.3
+scikit-learn==1.4.2
+scipy==1.13.0
+semver==3.0.2
+sentence-transformers==2.2.2
+sentencepiece==0.2.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.29
+streamlit==1.34.0
+streamlit-chat==0.1.1
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.4.0
+tiktoken==0.4.0
+tokenizers==0.13.3
+toml==0.10.2
+toolz==0.12.1
+torch==2.2.2
+torchvision==0.17.2
+tornado==6.4
+tqdm==4.66.2
+transformers==4.45.0
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+tzdata==2024.1
+tzlocal==5.2
+urllib3==2.2.1
+validators==0.28.1
+watchdog==4.0.0
+yarl==1.9.4
+zipp==3.18.1
+supabase
+docx2txt
+soundfile
+SpeechRecognition
+chardet
+streamlit_mic_recorder
+gtts

tests/__init__.py ADDED Viewed

File without changes

tests/test_config.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import os
+from app.config import Config
+def test_config_load():
+    """Every setting exposed by ``Config`` must be present in the environment."""
+    for setting in ("SUPABASE_URL", "SUPABASE_KEY", "REPLICATE_API_TOKEN"):
+        assert getattr(Config, setting) is not None

tests/test_db.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from app.db import supabase
+def test_supabase_connection():
+    """The module-level Supabase client must have been created."""
+    client = supabase
+    assert client is not None