import streamlit as st
import faiss
import numpy as np
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
from sentence_transformers import SentenceTransformer
from docx import Document
import PyPDF2 # Use PyPDF2 instead of PyMuPDF
import requests
from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException
# Initialize models and pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")  # SQuAD-fine-tuned checkpoint so the QA head has trained weights
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
# FAISS index setup (in-memory)
dimension = 512 # Size of the embeddings
index = faiss.IndexFlatL2(dimension)
documents = []
# Initialize translation model for on-the-fly translation
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
def translate_text(text, src_lang, tgt_lang):
    """Translate text using the M2M100 model."""
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
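# Usage sketch (illustrative only, not part of the app flow): language codes follow
# the M2M100 convention, e.g. "fr" for French and "en" for English.
#   translate_text("Bonjour tout le monde", "fr", "en")  # -> an English rendering such as "Hello everyone"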
# Sidebar for navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Upload Knowledge", "Q&A"])
# Page 1: Knowledge Upload
if page == "Upload Knowledge":
    st.title("Upload Knowledge Base")
    uploaded_files = st.file_uploader("Upload your files (DOCX, PDF)", type=["pdf", "docx"], accept_multiple_files=True)
    url = st.text_input("Or enter a website URL to scrape")

    if uploaded_files or url:
        st.write("Processing your data...")
        texts = []

        # Process uploaded files
        for file in uploaded_files:
            try:
                if file.type == "application/pdf":
                    pdf_reader = PyPDF2.PdfReader(file)  # Use PyPDF2 for PDF reading
                    text = ""
                    for pdf_page in pdf_reader.pages:  # Avoid shadowing the sidebar "page" selection
                        text += pdf_page.extract_text() or ""  # extract_text() may return None for image-only pages
                elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    doc = Document(file)
                    text = " ".join([para.text for para in doc.paragraphs])
                else:
                    st.error(f"Unsupported file type: {file.type}")
                    continue

                # Language detection
                try:
                    detected_lang = detect(text)
                    st.write(f"Detected language: {detected_lang}")
                except LangDetectException:
                    st.error("Could not detect the language of the text.")
                    continue

                # Generate embeddings
                embedding = embedding_model.encode([text])[0]

                # Add the embedding to the FAISS index
                index.add(np.array([embedding], dtype=np.float32))
                documents.append(text)
                texts.append(text)
            except Exception as e:
                st.error(f"Error processing file: {e}")
        # Process URL
        if url:
            try:
                response = requests.get(url, timeout=10)  # Time out rather than hang on unresponsive sites
                soup = BeautifulSoup(response.text, 'html.parser')
                text = soup.get_text()
                try:
                    detected_lang = detect(text)
                    st.write(f"Detected language: {detected_lang}")
                except LangDetectException:
                    st.error("Could not detect the language of the webpage.")
                    url = None  # Skip further processing of this URL

                if url:  # Continue only if language detection succeeded
                    # Generate embedding
                    embedding = embedding_model.encode([text])[0]

                    # Add the embedding to the FAISS index
                    index.add(np.array([embedding], dtype=np.float32))
                    documents.append(text)
                    texts.append(text)
            except Exception as e:
                st.error(f"Error processing URL: {e}")

        st.write("Data processed and added to knowledge base!")

        # Provide a summary of the uploaded content
        for i, text in enumerate(texts):
            st.write(f"Summary of Document {i+1}:")
            st.write(text[:500] + "...")  # Display first 500 characters as a summary
# Page 2: Q&A Interface
elif page == "Q&A":
    st.title("Ask the Knowledge Base")
    user_query = st.text_input("Enter your query:")

    if user_query:
        try:
            detected_query_lang = detect(user_query)

            # Translate the query if it's in a different language than the knowledge base
            if detected_query_lang != "en":
                st.write(f"Translating query from {detected_query_lang} to English")
                user_query = translate_text(user_query, detected_query_lang, "en")

            query_embedding = embedding_model.encode([user_query])
            D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)  # Retrieve top 5 documents

            # FAISS pads the result with -1 when fewer than k documents are indexed
            context = " ".join(documents[i] for i in I[0] if 0 <= i < len(documents))

            # Pass the (translated) query and retrieved context to the QA pipeline
            result = qa_pipeline(question=user_query, context=context)
            st.write(f"Answer: {result['answer']}")
        except LangDetectException:
            st.error("Could not detect the language of the query.")
        except Exception as e:
            st.error(f"Error during Q&A processing: {e}")
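# To try the app locally (assuming this file is saved as app.py, the usual entry point for a Streamlit Space):
#   streamlit run app.py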