import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModel
import faiss
import numpy as np
import torch
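
# Assumed dependencies (inferred from the imports; not pinned here):
# streamlit, PyPDF2, transformers, torch, faiss-cpu (or faiss-gpu), numpy.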

# Load the Hugging Face model for text generation
@st.cache_resource
def load_text_generator():
    return pipeline("text2text-generation", model="google/flan-t5-base")

# Load the Hugging Face model for embeddings
@st.cache_resource
def load_embedding_model():
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return tokenizer, model

text_generator = load_text_generator()
embedding_tokenizer, embedding_model = load_embedding_model()

# Function to extract text from a PDF
def extract_pdf_content(pdf_file):
    reader = PdfReader(pdf_file)
    content = ""
    for page in reader.pages:
        # extract_text() can return None (e.g., for scanned/image-only pages)
        content += page.extract_text() or ""
    return content

# Function to split content into chunks
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Function to compute chunk embeddings. Mean pooling over token embeddings is
# used here (as recommended on the all-MiniLM-L6-v2 model card) rather than
# pooler_output, which is not trained for sentence similarity.
def compute_embeddings(text_chunks):
    embeddings = []
    for chunk in text_chunks:
        inputs = embedding_tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = embedding_model(**inputs)
        # Average the token embeddings, masking out padding positions
        mask = inputs["attention_mask"].unsqueeze(-1)
        summed = (outputs.last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)
        embeddings.append((summed / counts).squeeze(0).numpy())
    return np.array(embeddings, dtype="float32")  # FAISS expects float32

# Function to build a FAISS index over the chunk embeddings
def build_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact search; L2 distance (lower = more similar)
    index.add(embeddings)
    return index
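
# (Alternative, not used here: L2-normalize the embeddings and use
# faiss.IndexFlatIP to rank by cosine similarity; on unit vectors the
# two orderings coincide.)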

# Function to search the FAISS index
def search_faiss_index(index, query_embedding, text_chunks, top_k=3):
    distances, indices = index.search(query_embedding, top_k)
    return [(text_chunks[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
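
# index.search takes a (num_queries, dimension) float32 array and returns
# (distances, indices), each shaped (num_queries, top_k); row 0 holds the
# results for our single query.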

# Function to generate structured content
def generate_professional_content(topic):
    prompt = f"Explain '{topic}' in bullet points, highlighting key concepts, examples, and applications."
    response = text_generator(prompt, max_length=300, num_return_sequences=1)
    return response[0]['generated_text']

# Function to compute the query embedding
def compute_query_embedding(query):
    # Reuse compute_embeddings so the query shares the chunks' embedding space;
    # the result is a (1, dimension) float32 array, as index.search expects.
    return compute_embeddings([query])

# Streamlit app
st.title("Generative AI for Electrical Engineering Education with FAISS")
st.sidebar.header("AI-Based Tutor with Vector Search")

# File upload section
uploaded_file = st.sidebar.file_uploader("Upload Study Material (PDF)", type=["pdf"])
topic = st.sidebar.text_input("Enter a topic (e.g., Newton's Third Law)")

if uploaded_file:
    # Extract and process the file content
    content = extract_pdf_content(uploaded_file)
    st.sidebar.success(f"{uploaded_file.name} uploaded successfully!")

    # Chunk the text and compute embeddings
    chunks = chunk_text(content)
    embeddings = compute_embeddings(chunks)

    # Build the FAISS index
    index = build_faiss_index(embeddings)
    st.write("**File Processed and Indexed for Search**")
    st.write(f"Total chunks created: {len(chunks)}")

# Generate study material
if st.button("Generate Study Material"):
    if topic:
        st.header(f"Study Material: {topic}")

        # Retrieve relevant chunks if a file was uploaded and indexed
        if uploaded_file:
            # Compute the query embedding and search the FAISS index
            query_embedding = compute_query_embedding(topic)
            results = search_faiss_index(index, query_embedding, chunks, top_k=3)
            st.write("**Relevant Content from Uploaded File:**")
            for result, distance in results:
                st.write(f"- {result} (L2 distance: {distance:.2f}, lower is more similar)")
        else:
            st.warning("No file uploaded. Generating AI-based content instead.")

        # Generate AI content
        ai_content = generate_professional_content(topic)
        st.write("**AI-Generated Content:**")
        st.write(ai_content)
    else:
        st.warning("Please enter a topic!")