import streamlit as st
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import pkg_resources
from symspellpy import SymSpell, Verbosity

# Initialize SymSpell for spelling correction
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the English frequency dictionary bundled with symspellpy
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
# Add a domain-specific term so it is not "corrected" away
sym_spell.create_dictionary_entry("cgpa", 100000)
def preprocess_text(text):
    # Use regex to split text into tokens, preserving numeric/alphanumeric data
    tokens = re.findall(r'\w+|\d+\w*|\S+', text)
    return tokens
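# Illustrative tokenization (hedged; this is what the regex above should produce):
# digits and decimal fragments are kept as separate tokens instead of being dropped.
#   preprocess_text("CGPA 9.65 in 2020")  ->  ['CGPA', '9', '.65', 'in', '2020']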
# Function to correct spelling while preserving numeric data
def correct_spelling(text):
    # Split text into tokens
    tokens = preprocess_text(text)
    corrected_tokens = []
    for token in tokens:
        # If the token is numeric, alphanumeric, or punctuation, preserve it
        if token.isdigit() or re.match(r'\d+\w*', token) or re.match(r'[.,]', token):
            corrected_tokens.append(token)
        else:
            # Otherwise, correct the token using SymSpell
            suggestions = sym_spell.lookup(token, max_edit_distance=2, verbosity=Verbosity.CLOSEST)
            if suggestions:
                corrected_token = suggestions[0].term  # Use the best suggestion
            else:
                corrected_token = token  # If no suggestion, keep the original token
            corrected_tokens.append(corrected_token)
    # Join the corrected tokens into a sentence
    return " ".join(corrected_tokens)
# Sample knowledge base (documents)
documents = [
    """
    Biodata or about ginni as name : GINNI GARG, email : [email protected], phone : +91-8295954475, Date of Birth - 1st January 1998.
    """,
    """
    Ginni completed his graduation, B.Tech in Computer Engineering, from National Institute of Technology, Kurukshetra between 2016-2020 with cgpa 9.65.
    """,
    """
    Father name of ginni is DharamPal Garg. He is Director, JSS Sirsa. Mother name is Rajni Garg, she is a housewife. Wife name of ginni is Ekta, she is a Bank Manager.
    """,
    """
    Ginni hobbies are reading books, Badminton, Yoga, Running, Walking, Exercises, GYM etc.
    """,
    """
    Ginni favourite books are Atomic Habits, Ikigai, Biography of Swami Vivekananda, Jeevan Amrit by OSHO etc.
    """,
    """
    Ginni domain expertise is Software Engineering, specifically Backend Engineering.
    """,
    """
    ginni completed schooling, both 10th (2012-2013) with cgpa 10, and 12th (2014-2015) with 91%, from D.A.V. Public School, Kalanwali.
    """,
    """
    All companies where ginni worked/has experience are as follows: CDOT, SirionLabs, Otipy and Arcesium.
    """,
    """
    GATE qualified in 2020 with All India Rank 2562, GATE score 562 and GATE marks 46.67/100. JEE Main qualified in 2016 with All India Rank 8123, JEE marks 231/360 and JEE percentile 99.3%.
    """,
    """
    Social media of ginni as follows - 'linkedin : www.linkedin.com/in/ginni-garg', 'github : https://github.com/GinniIndia'
    """,
    """
    All academic achievements of ginni:
    1. Received Award of Academic Excellence for Department Topper in first year.
    2. Received Award of Academic Excellence for securing third rank among all departments in first year.
    3. Member of Institution Innovation Council under the aegis of MHRD’s Innovation Cell established at NIT, Kurukshetra for academic year 2018-2019.
    4. Department Rank 4 (Computer Engineering graduation) and University Rank 5.
    5. Secured rank 252 at the national level in the National Level Science Talent Search Examination.
    """,
    """
    List of all publications or research papers of ginni:
    1. Ginni Garg and Ritu Garg. “Brain Tumor Detection and Classification using Hybrid Ensemble Classifier”. International Journal of Healthcare Information Systems and Informatics (IJHISI), IGI Global, Clarivate Analytics indexed, Scopus indexed.
    arxiv link: https://arxiv.org/abs/2101.00216
    2. Ginni Garg and Mantosh Biswas. “Improved Neural Network Based Plant Disease Identification” in First International Conference on Advanced Communication & Computational Technology (ICACCT) 2019, Scopus indexed, LNEE format.
    arxiv link: https://arxiv.org/abs/2101.00215
    3. Ginni Garg and Ritu Garg. “A Hybrid MLP-SVM based classification using spatial-spectral features on Hyperspectral Images”. International Conference on Futuristic Trends in Networks and Computing Technologies, FTNCT-2020, approved by CCIS, Springer (indexed by Scopus and DBLP), Southern Federal University, Russia.
    arxiv link: https://arxiv.org/abs/2101.00214
    """
]
# Step 1: Embed documents using a transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = model.encode(documents)
d = doc_embeddings.shape[1]  # Embedding dimensionality

# Step 2: Create FAISS index for efficient retrieval
# Exact (brute-force) L2 index, kept for reference; retrieval below uses the HNSW index
index = faiss.IndexFlatL2(d)
index.add(np.array(doc_embeddings).astype("float32"))

# Alternative: IVF index (must be trained before adding vectors)
# nlist = 5  # Number of clusters (adjust based on data size)
# quantizer = faiss.IndexFlatL2(d)  # L2 distance metric for clustering
# ivf_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
# ivf_index.train(doc_embeddings)
# ivf_index.add(doc_embeddings)

# HNSW graph index; 32 neighbours per node is a commonly recommended setting
hnsw_index = faiss.IndexHNSWFlat(d, 32)
hnsw_index.add(np.array(doc_embeddings).astype("float32"))
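# Optional sanity check (illustrative, left commented out so it does not run in the app):
# embed a sample query and confirm the HNSW index returns a plausible document.
# sample_emb = np.array(model.encode(["Where did Ginni study?"])).astype("float32")
# _dists, _ids = hnsw_index.search(sample_emb, k=1)
# print(documents[_ids[0][0]])  # ideally one of the education-related documents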
# Step 3: Define the RAG pipeline
def rag_qa(question):
    # Spell-correct the question before embedding it
    question = correct_spelling(question)
    print(f'corrected question : {question}')
    # Retrieve the closest document from the HNSW index
    question_embedding = model.encode([question])
    distances, retrieved_indices = hnsw_index.search(np.array(question_embedding).astype("float32"), k=1)
    retrieved_doc = documents[retrieved_indices[0][0]]
    # Generate an answer conditioned on the retrieved context
    # (note: the pipeline is re-created on every call; caching it would avoid reloading the model)
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-large")
    prompt = f"Context: {retrieved_doc}\n\nQ: {question}\nA: If the answer is not clear from the context, respond with 'I don't know'"
    response = qa_pipeline(prompt, max_length=1000)
    return response[0]['generated_text']
# Step 4: Streamlit UI implementation
st.title("🧠 Ask anything about Ginni !")
question = st.text_input("Ask your question:")

if st.button("Get Answer"):
    if question.strip():
        answer = rag_qa(question)
        st.success(f"**Answer:** {answer}")
    else:
        st.warning("Please enter a valid question.")
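# To run locally (assuming this file is saved as app.py and that streamlit,
# transformers, sentence-transformers, faiss-cpu (or faiss-gpu), and symspellpy
# are installed):
#   streamlit run app.py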