Upload Hindi embeddings model and all associated files
- hindi-rag-system.py +117 -137
- hindi-rag-system.py.amltmp +117 -137
hindi-rag-system.py
CHANGED
@@ -744,35 +744,71 @@ def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):

     return model, tokenizer

-def setup_qa_system(model, tokenizer, vector_store):
+# NEW FUNCTIONS FOR COMBINED RESULTS APPROACH
+
+def combine_top_results(results, query, max_results=4):
+    """
+    Combine the top search results into a single coherent context
+
+    Args:
+        results: List of (Document, score) tuples from retrieval
+        query: Original user query
+        max_results: Maximum number of results to combine
+
+    Returns:
+        String containing combined context from top results
+    """
+    # Sort results by score (highest first) and take top N
+    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:max_results]
+
+    combined_texts = []
+    seen_content = set()  # To avoid duplicates
+
+    for doc, score in sorted_results:
+        # Extract relevant sentences to keep context focused
+        relevant_text = extract_relevant_sentences(doc.page_content, query, window_size=3)
+
+        # Skip if this exact text has been seen before
+        if relevant_text in seen_content:
+            continue
+
+        # Add source information to the text
+        source_name = os.path.basename(doc.metadata["source"])
+        text_with_source = f"{relevant_text} [Source: {source_name}]"
+
+        combined_texts.append(text_with_source)
+        seen_content.add(relevant_text)
+
+    # Combine all texts with clear separation
+    combined_context = "\n\n".join(combined_texts)
+
+    return combined_context
+
+def setup_enhanced_qa_system(model, tokenizer, vector_store):
     """
-    Set up
+    Set up an enhanced QA system using the model and retriever with result combination
     """
     # Create retriever
     retriever = vector_store.as_retriever(
         search_type="similarity",
-        search_kwargs={"k":
+        search_kwargs={"k": 6}  # Get more results than we'll use to filter better
     )

-    # Create a function to generate answers
-    def
-        #
-        try:
-            docs = retriever.invoke(query)
-        except:
-            # Fallback to older method if invoke isn't available
-            docs = retriever.get_relevant_documents(query)
+    # Create a function to generate answers with combined context
+    def generate_enhanced_answer(query):
+        # Get raw documents and scores
+        docs = vector_store.similarity_search_with_score(query, k=6)

-        #
-
+        # Combine the top results into a single context
+        combined_context = combine_top_results(docs, query, max_results=4)

-        # Create prompt
+        # Create prompt with the combined context
         prompt = f"""
 आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।
-यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें।
+यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें। अपने उत्तर में सभी प्रासंगिक जानकारी का उपयोग करें।

 संदर्भ:
-{
+{combined_context}

 प्रश्न: {query}

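The new combine_top_results above depends on an extract_relevant_sentences(text, query, window_size=3) helper that is defined elsewhere in hindi-rag-system.py and is not part of this diff. Purely as an illustration of what such a helper typically does, a minimal sketch matching that call signature; the sentence splitting and overlap scoring below are assumptions, not the file's actual implementation:

import re

def extract_relevant_sentences(text, query, window_size=3):
    # Split on the Devanagari danda as well as Western sentence punctuation (assumed heuristic).
    sentences = [s.strip() for s in re.split(r"(?<=[।.!?])\s+", text) if s.strip()]
    if not sentences:
        return text

    # Score each sentence by simple word overlap with the query (assumed heuristic).
    query_words = set(query.split())
    scores = [len(query_words & set(s.split())) for s in sentences]
    best = max(range(len(sentences)), key=lambda i: scores[i])

    # Keep a window of sentences around the best match so some surrounding context survives.
    start = max(0, best - window_size)
    end = min(len(sentences), best + window_size + 1)
    return " ".join(sentences[start:end])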
@@ -797,7 +833,7 @@ def setup_qa_system(model, tokenizer, vector_store):
                 do_sample=True
             )
         except Exception as e:
-            return f"Error generating response: {str(e)}"
+            return f"Error generating response: {str(e)}", None

         # Decode the generated text
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -805,9 +841,9 @@ def setup_qa_system(model, tokenizer, vector_store):
         # Extract just the answer part (after the prompt)
         answer = full_response.split("उत्तर:")[-1].strip()

-        return answer
+        return answer, combined_context

-    return
+    return generate_enhanced_answer

 # Main RAG functions
 def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
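In the new version, setup_enhanced_qa_system returns a closure that hands back a two-element tuple (the answer plus the combined context it was built from) rather than a bare answer string. A short usage sketch based on the code above; the sample question is only an illustration:

# llm_model, llm_tokenizer and vector_store come from the loading functions
# earlier in hindi-rag-system.py, exactly as in main() below.
qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)

answer, context = qa_generator("भारत की राजधानी क्या है?")  # example query
print(context)  # the source-tagged context that was sent to the LLM
print(answer)   # the text extracted after "उत्तर:"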
@@ -858,54 +894,10 @@ def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="cuda"):
     # Perform similarity search
     results = perform_similarity_search(vector_store, query, k=k)

-
-    processed_results = []
-    seen_chunks = set()
-
-    for doc, score in results:
-        chunk_id = doc.metadata["chunk_id"]
-        source = doc.metadata["source"]
-
-        # Skip if we've already included this chunk
-        if (source, chunk_id) in seen_chunks:
-            continue
-
-        seen_chunks.add((source, chunk_id))
-
-        # Try to find adjacent chunks and combine them
-        combined_content = doc.page_content
-
-        # Look for adjacent chunks in results (both previous and next)
-        for adj_id in [chunk_id-1, chunk_id+1]:
-            for other_doc, _ in results:
-                if (other_doc.metadata["source"] == source and
-                    other_doc.metadata["chunk_id"] == adj_id and
-                    (source, adj_id) not in seen_chunks):
-
-                    # Add the adjacent chunk content
-                    if adj_id < chunk_id:  # Previous chunk
-                        combined_content = other_doc.page_content + " " + combined_content
-                    else:  # Next chunk
-                        combined_content = combined_content + " " + other_doc.page_content
-
-                    seen_chunks.add((source, adj_id))
-
-        # Create a new document with combined content
-        combined_doc = Document(
-            page_content=combined_content,
-            metadata={
-                "source": source,
-                "chunk_id": chunk_id,
-                "is_combined": True if combined_content != doc.page_content else False
-            }
-        )
-
-        processed_results.append((combined_doc, score))
-
-    return processed_results, vector_store
+    return results, vector_store

 def main():
-    parser = argparse.ArgumentParser(description="Hindi RAG System with
+    parser = argparse.ArgumentParser(description="Hindi RAG System with Combined Results")
     parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
                         help="Directory containing the model and tokenizer")
     parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",
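query_text_corpus no longer merges adjacent chunks itself; it now returns the raw (Document, score) pairs together with the loaded vector store and leaves any combination to combine_top_results. A sketch of the new calling convention, following the signature in the hunk header above (the query string and keyword values are placeholders):

# Placeholder query; embed_model, embed_tokenizer and vector_store_path are set up in main().
results, vector_store = query_text_corpus(
    embed_model, embed_tokenizer, vector_store_path, "कोई प्रश्न", k=6, device="cuda"
)
for doc, score in results:
    print(score, doc.metadata["source"], doc.metadata["chunk_id"])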
@@ -928,10 +920,12 @@ def main():
                         help="Run in interactive mode for querying")
     parser.add_argument("--reindex", action="store_true",
                         help="Force reindexing even if index exists")
-    parser.add_argument("--qa", action="store_true",
-                        help="Use LLM for question answering instead of just retrieval")
     parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
                         help="HuggingFace model name for the LLM")
+    parser.add_argument("--show_context", action="store_true",
+                        help="Show the combined context sent to the LLM")
+    parser.add_argument("--show_raw_results", action="store_true",
+                        help="Show the raw search results before combination")
     args = parser.parse_args()

     # Load embedding model and tokenizer
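With the --qa flag gone, the LLM is always loaded and the two new flags only control what gets printed. Example invocations under the defaults shown above (the argument values are illustrative placeholders; the flags themselves come from the argparse definitions in this diff):

python hindi-rag-system.py --index --data_dir ./hindi_texts --output_dir ./rag_output
python hindi-rag-system.py --query "भारत का संविधान कब लागू हुआ?" --show_raw_results --show_context
python hindi-rag-system.py --interactive --show_context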
@@ -943,20 +937,15 @@ def main():
     # Create vector store path
     vector_store_path = os.path.join(args.output_dir, "faiss_index")

-    # Load LLM
-
-
-
-
-
-
-
-
-        print("LLM loaded successfully for QA")
-    except Exception as e:
-        print(f"Error loading LLM: {e}")
-        print("Falling back to retrieval-only mode")
-        args.qa = False
+    # Load LLM
+    try:
+        # Load LLM
+        llm_model, llm_tokenizer = load_llama_model(args.llm_name, args.device)
+        print("LLM loaded successfully for QA")
+    except Exception as e:
+        print(f"Error loading LLM: {e}")
+        print("Cannot proceed without LLM for this combined results approach")
+        return

     if args.index or args.reindex:
         # Index text files
@@ -964,47 +953,43 @@ def main():
             embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
         )
         print(f"Indexing complete. Vector store saved to {vector_store_path}")
-
-
-
-
+
+    # Load vector store for querying
+    embeddings = HindiSentenceEmbeddings(embed_model, embed_tokenizer, device=args.device)
+    vector_store = load_vector_store(vector_store_path, embeddings)
+
+    # Set up enhanced QA system
+    qa_generator = setup_enhanced_qa_system(llm_model, llm_tokenizer, vector_store)

     if args.query:
-        #
-
-            embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
-        )
+        # Process the query with the enhanced system
+        print(f"\nProcessing query: {args.query}")

-        #
-
-
-
-
+        # Show raw results if requested
+        if args.show_raw_results:
+            results, _ = query_text_corpus(
+                embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
+            )

-
-
-
-
-
-
-
-
+            print("\nRaw Search Results:")
+            for i, (doc, score) in enumerate(results):
+                print(f"\nResult {i+1} (Score: {score:.4f}):")
+                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+                print(f"Content: {doc.page_content[:200]}...")
+
+        # Generate enhanced answer
+        answer, context = qa_generator(args.query)
+
+        if args.show_context:
+            print("\nCombined Context:")
+            print(context)

-
-
-        try:
-            answer = qa_generator(args.query)
-            print("\nLLM Answer:")
-            print(answer)
-        except Exception as e:
-            print(f"Error generating answer: {e}")
+        print("\nEnhanced LLM Answer:")
+        print(answer)

     if args.interactive:
         print("\nInteractive mode. Enter queries (or type 'quit' to exit).")

-        # For the first query, load vector store
-        vector_store = None
-
         while True:
             print("\nEnter query:")
             query = input()
@@ -1015,33 +1000,28 @@ def main():
             if query.lower() == 'quit':
                 break

-            #
-
-
-
-
-            # Print retrieval results
-            print("\nSearch Results:")
-            for i, (doc, score) in enumerate(results):
-                print(f"\nResult {i+1} (Score: {score:.4f}):")
-                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+            # Show raw results if requested
+            if args.show_raw_results:
+                results, _ = query_text_corpus(
+                    embed_model, embed_tokenizer, vector_store_path, query, args.top_k, args.device
+                )

-
-
-
+                print("\nRaw Search Results:")
+                for i, (doc, score) in enumerate(results):
+                    print(f"\nResult {i+1} (Score: {score:.4f}):")
+                    print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+                    print(f"Content: {doc.page_content[:200]}...")
+
+            # Process the query
+            print(f"\nProcessing query: {query}")
+            answer, context = qa_generator(query)

-
-
-
-                qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)
+            if args.show_context:
+                print("\nCombined Context:")
+                print(context)

-
-
-                answer = qa_generator(query)
-                print("\nLLM Answer:")
-                print(answer)
-            except Exception as e:
-                print(f"Error generating answer: {e}")
+            print("\nEnhanced LLM Answer:")
+            print(answer)

     # Clean up GPU memory
     if args.device == "cuda":
hindi-rag-system.py.amltmp
CHANGED
(same changes as hindi-rag-system.py above)