import streamlit as st
import faiss
import numpy as np
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
from sentence_transformers import SentenceTransformer
from docx import Document
import PyPDF2  # Use PyPDF2 instead of PyMuPDF
import requests
from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException

# Initialize models once and cache them so Streamlit reruns don't reload them
@st.cache_resource
def load_models():
    # Use a SQuAD-fine-tuned checkpoint so the QA head returns meaningful answers
    qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    embedder = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    return qa, embedder, m2m_tokenizer, m2m_model

qa_pipeline, embedding_model, tokenizer, model = load_models()

# FAISS index setup (in-memory, kept in session state so it survives Streamlit reruns)
dimension = 512  # Embedding size of distiluse-base-multilingual-cased-v1
if "index" not in st.session_state:
    st.session_state.index = faiss.IndexFlatL2(dimension)
    st.session_state.documents = []
index = st.session_state.index
documents = st.session_state.documents

def translate_text(text, src_lang, tgt_lang):
    """ Translate text using the M2M100 model. """
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt")
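    # forced_bos_token_id makes M2M100 start decoding in the requested target language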
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
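
# Example (illustrative): translate_text("¿Dónde está la estación?", "es", "en") returns the English translation of the question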

# Sidebar for navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Upload Knowledge", "Q&A"])

# Page 1: Knowledge Upload
if page == "Upload Knowledge":
    st.title("Upload Knowledge Base")
    uploaded_files = st.file_uploader("Upload your files (DOCX, PDF)", type=["pdf", "docx"], accept_multiple_files=True)
    url = st.text_input("Or enter a website URL to scrape")

    if uploaded_files or url:
        st.write("Processing your data...")
        texts = []

        # Process uploaded files
        for file in uploaded_files:
            try:
                if file.type == "application/pdf":
                    pdf_reader = PyPDF2.PdfReader(file)  # Use PyPDF2 for PDF reading
                    text = ""
                    for pdf_page in pdf_reader.pages:  # avoid shadowing the sidebar's `page` variable
                        text += pdf_page.extract_text() or ""  # extract_text() can return None for image-only pages
                elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    doc = Document(file)
                    text = " ".join([para.text for para in doc.paragraphs])
                else:
                    st.error(f"Unsupported file type: {file.type}")
                    continue

                # Language detection
                try:
                    detected_lang = detect(text)
                    st.write(f"Detected language: {detected_lang}")
                except LangDetectException:
                    st.error("Could not detect the language of the text.")
                    continue

                # Generate embeddings
                embedding = embedding_model.encode([text])[0]
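                # Note: each document is embedded as a single vector; chunking long texts would likely improve retrieval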

                # Add the embedding to FAISS index
                index.add(np.array([embedding], dtype=np.float32))
                documents.append(text)
                texts.append(text)
            except Exception as e:
                st.error(f"Error processing file: {e}")

        # Process URL
        if url:
            try:
                response = requests.get(url, timeout=10)  # avoid hanging on unresponsive sites
                soup = BeautifulSoup(response.text, 'html.parser')
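                # get_text() also captures navigation, script, and style text; acceptable for a rough prototype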
                text = soup.get_text()

                try:
                    detected_lang = detect(text)
                    st.write(f"Detected language: {detected_lang}")
                except LangDetectException:
                    st.error("Could not detect the language of the webpage.")
                    url = None  # Set URL to None or skip to prevent further processing

                if url:  # Continue only if URL processing is valid
                    # Generate embedding
                    embedding = embedding_model.encode([text])[0]

                    # Add the embedding to FAISS index
                    index.add(np.array([embedding], dtype=np.float32))
                    documents.append(text)
                    texts.append(text)
            except Exception as e:
                st.error(f"Error processing URL: {e}")

        st.write("Data processed and added to knowledge base!")

        # Show a short preview of each newly added document
        for i, text in enumerate(texts):
            st.write(f"Preview of Document {i+1}:")
            st.write(text[:500] + "...")  # First 500 characters

# Page 2: Q&A Interface
elif page == "Q&A":
    st.title("Ask the Knowledge Base")
    user_query = st.text_input("Enter your query:")

    if user_query:
        try:
            detected_query_lang = detect(user_query)

            # Translate the query if it's in a different language than the knowledge base
            if detected_query_lang != "en":
                st.write(f"Translating query from {detected_query_lang} to English")
                user_query = translate_text(user_query, detected_query_lang, "en")

            query_embedding = embedding_model.encode([user_query])
            D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)  # D = distances, I = indices of the top 5 documents
            context = " ".join(documents[i] for i in I[0] if 0 <= i < len(documents))  # FAISS pads missing hits with -1

            if not context.strip():
                st.warning("The knowledge base is empty. Add documents on the 'Upload Knowledge' page first.")
            else:
                # Pass the (possibly translated) query and the retrieved context to the QA pipeline
                result = qa_pipeline(question=user_query, context=context)
                st.write(f"Answer: {result['answer']}")
        except LangDetectException:
            st.error("Could not detect the language of the query.")
        except Exception as e:
            st.error(f"Error during Q&A processing: {e}")