iamironman4279 committed on
Commit
257888a
·
verified ·
1 Parent(s): de9f44f

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +13 -0
  2. app.py +69 -0
  3. embed_with_db.py +69 -0
  4. requirements.txt +0 -0
  5. vectorize.py +57 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Chat Bot
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.22.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from embed_with_db import get_all_collections, VECTORDB_STORE, config
3
+ from vectorize import VectorDataBase
4
+
5
def respond(message, chat_history, collection_name):
    """Answer one chat message using the retrieval chain of the chosen collection.

    Args:
        message: The text the user typed into the prompt box.
        chat_history: The chatbot history list; a ``(message, answer)`` pair
            is appended to it in place.
        collection_name: Display label of the collection to retrieve from.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the prompt
        box, the history refreshes the chatbot component.
    """
    # A new store/chain is built for every message so that the collection
    # currently selected in the dropdown is always the one being queried.
    answer = VECTORDB_STORE(collection_name).chain().invoke(message)
    chat_history.append((message, answer))
    return "", chat_history
10
+
11
+
12
def embed_and_store(password, collection_name, file_type, file_fields, context, page_start):
    """Embed an uploaded file or a pasted string and store it in the database.

    Args:
        password: Admin password typed by the user.
        collection_name: Name of the collection that receives the chunks.
        file_type: One of the dropdown values ``'PDF'``, ``'TEXT'`` or
            ``'STRING'`` (compared case-insensitively).
        file_fields: Value of the file component (used for PDF/TEXT).
        context: Raw text typed by the user (used for STRING).
        page_start: Start offset forwarded to ``VectorDataBase``; a blank
            value is treated as ``0``.

    Returns:
        ``(file_fields, context)`` unchanged, so both output components keep
        the values they had when the button was pressed.

    Raises:
        Exception: If the password does not match ``config['PASSWORD_DB']``
            or no password is configured at all.
    """
    import hmac

    expected = config['PASSWORD_DB']
    supplied = '' if password is None else str(password)
    # Fail closed when no password is configured, and compare in constant
    # time so the check does not leak how many characters matched.
    if not expected or not hmac.compare_digest(
        supplied.encode('utf-8'), str(expected).encode('utf-8')
    ):
        raise Exception('Something went wrong')

    # The dropdown emits upper-case values ('STRING'), so the comparison must
    # be case-insensitive; otherwise the typed context is never used.
    is_raw_string = str(file_type).lower() == 'string'
    # Keep the source in its own variable instead of overwriting file_fields:
    # the returned file_fields feeds a file component and must not become text.
    source = context if is_raw_string else file_fields

    vector_db = VectorDataBase(
        source, collection_name, file_type, page_start=page_start or 0
    )
    vector_db.embedding_with_loop()
    return file_fields, context
21
def update_interface(file_type):
    """Toggle between the file picker and the free-text box.

    Args:
        file_type: The value currently selected in the file-type dropdown.

    Returns:
        A ``(textbox, file)`` pair of component updates: for ``'PDF'`` and
        ``'TEXT'`` the file picker is shown and the text box hidden; for any
        other value the text box is shown and the file picker hidden.
    """
    if file_type not in ('PDF', 'TEXT'):
        # Anything that is not a file upload is entered as plain text.
        return (
            gr.Textbox(visible=True, label='Enter the Context', interactive=True),
            gr.File(visible=False),
        )
    return (
        gr.Textbox(visible=False),
        gr.File(label='Select the file', interactive=True, visible=True),
    )
26
+
27
# Two-tab Gradio UI: a chat tab backed by the retrieval chain and an admin
# tab for embedding new documents into the database.
# NOTE(review): the nesting of the widgets below was reconstructed; confirm
# which inputs belong inside the Row and where the footer Markdown sits.
with gr.Blocks() as demo:
    with gr.Tab('Personal Chat bot'):
        gr.Markdown("""
            <div align='center'>RAG Application with Open Source models</div>
            <div align='center'>
            > You could ask anything about Me & Data Science. I hope it will find you well
            </div>
            """)
        # Labels come from the database; custom values are allowed, so the
        # handler must cope with a name that does not exist.
        db_collection = gr.Dropdown(
            list(get_all_collections().values()), label="Select Collection for the retriever",
            value='Data scientist',
            allow_custom_value=True)
        chatbot = gr.Chatbot(height=480)  # Just to fit the notebook
        msg = gr.Textbox(label="Prompt", interactive=True)
        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")
        btn.click(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])
        msg.submit(respond, inputs=[msg, chatbot, db_collection], outputs=[msg, chatbot])  # Press enter to submit

    with gr.Tab('Data Base and Embedding Store'):
        gr.Markdown("""
            <div align='center'>Store the Document | String in Database</div>

            > Only admin user allowed
            """)
        with gr.Row():
            # Mask the admin password instead of echoing it in clear text.
            password = gr.Textbox(label='Enter the Password', type='password')
            collection_name = gr.Textbox(label='Collection Name')
            # Start at 0 so an untouched field never reaches int() as ''.
            page_start = gr.Textbox(label='Page Start', value='0')
            file_type = gr.Dropdown(['PDF', 'TEXT', 'STRING'], label='Select File Type',
                                    value='PDF')
        file_fields = gr.File(visible=True, interactive=True)
        context = gr.Textbox(label="Enter the Context", visible=False)
        btn = gr.Button("Submit")

        btn.click(embed_and_store, inputs=[password, collection_name, file_type, file_fields, context, page_start], outputs=[file_fields, context])
        # Swap the file picker and the text box whenever the type changes.
        file_type.change(update_interface, inputs=[file_type], outputs=[context, file_fields])
    gr.Markdown("""
        <div align='center'>It could be helpful for making RAG application</div>
        <div align='center'>| MONGODB | LANGCHAIN | HUGGINGFACE | MISTRAL |</div>
        """)

demo.launch()
embed_with_db.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_mongodb import MongoDBAtlasVectorSearch
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from pymongo import MongoClient
4
+ from langchain_core.runnables import RunnablePassthrough
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain.prompts import ChatPromptTemplate
7
+ from langchain_community.llms import HuggingFaceEndpoint
8
+
9
+ import os
10
+
11
# Runtime settings, each read from the environment variable of the same name
# (a missing variable yields None).
config = {
    env_key: os.getenv(env_key)
    for env_key in (
        'MONGODB_CONN_STRING',
        'HUGGINGFACEHUB_API_TOKEN',
        'DB_NAME',
        'VECTOR_SEARCH_INDEX',
        'PASSWORD_DB',
    )
}

# Shared, module-level handles reused by every request.
client = MongoClient(config['MONGODB_CONN_STRING'])
embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")

llm_model = HuggingFaceEndpoint(
    repo_id='mistralai/Mistral-7B-Instruct-v0.2',
    huggingfacehub_api_token=config['HUGGINGFACEHUB_API_TOKEN'],
    temperature=0.3,
)
25
+
26
# Prompt sent to the instruct model. The whole instruction, the retrieved
# context and the question live inside ONE [INST] ... [/INST] pair; the
# previous version closed [/INST] twice and placed an end-of-sequence token
# (</s>) in the middle of the prompt.
# NOTE(review): the explicit <s> was dropped on the assumption that the
# serving side adds the begin-of-sequence token itself — confirm.
template = """[INST] Instruction: You are a helpful chatbot who can answer all data science, anime and manga questions.
You have to follow these rules strictly while answering the question based on context:
1. Do not use the word context or based on context which is provided in answers.
2. If there is no context you have to answer in 128 words not more than that.
3. context are in series format so make your own best pattern based on that give answer.
context:
{context}
### QUESTION:
{question} [/INST]
"""
# Fills {context} and {question}; the parser turns the model output into str.
prompt = ChatPromptTemplate.from_template(template=template)
parser = StrOutputParser()
40
+
41
+
42
def get_all_collections():
    """Map every collection name in the configured database to a display label.

    Returns:
        A dict whose keys are the raw collection names and whose values are
        the same names capitalized, with underscores replaced by spaces
        (e.g. ``'data_scientist'`` -> ``'Data scientist'``).
    """
    raw_names = client[config['DB_NAME']].list_collection_names()
    return {
        raw: str(raw).capitalize().replace('_', ' ')
        for raw in raw_names
    }
49
class VECTORDB_STORE:
    """Vector store over one collection, plus the retrieval chain built on it."""

    def __init__(self, coll_name):
        """Open the vector store for a collection.

        Args:
            coll_name: Display label of the collection (as produced by
                ``get_all_collections``) or the raw collection name.

        Raises:
            ValueError: If ``coll_name`` matches no collection in the
                configured database.
        """
        collection_name = self.get_collection_name(coll_name)
        if collection_name is None:
            # The UI dropdown accepts custom values, so fail with a clear
            # message instead of indexing the database with None.
            raise ValueError(f'Unknown collection: {coll_name!r}')
        collection = client[config['DB_NAME']][collection_name]
        self.vectordb_store = MongoDBAtlasVectorSearch(
            collection=collection,
            embedding=embeddings,
            index_name=config['VECTOR_SEARCH_INDEX'])

    @staticmethod
    def get_collection_name(coll_name):
        """Resolve a display label (or raw name) to the raw collection name.

        Args:
            coll_name: Label shown in the UI, or the raw collection name.

        Returns:
            The raw collection name, or ``None`` when nothing matches.
        """
        collections = get_all_collections()
        # Display labels take precedence, as before.
        for key, value in collections.items():
            if coll_name == value:
                return key
        # Fall back to an exact raw-name match for values typed by hand.
        if coll_name in collections:
            return coll_name
        return None

    def chain(self):
        """Build the chain: retrieve context, fill the prompt, call the LLM.

        Returns:
            A runnable that takes the question string; it feeds the question
            to the retriever (top 10 matches) and to the prompt, then pipes
            the prompt through ``llm_model`` and ``parser``.
        """
        retriever = self.vectordb_store.as_retriever(search_kwargs={"k": 10})
        chain = {'context': retriever, 'question': RunnablePassthrough()} | prompt | llm_model | parser
        return chain
68
+
69
+
requirements.txt ADDED
Binary file (4.07 kB). View file
 
vectorize.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_mongodb import MongoDBAtlasVectorSearch
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
4
+ from embed_with_db import embeddings, config, client
5
+ from tqdm import tqdm
6
+
7
class VectorDataBase():
    """Split a PDF, a text file or a raw string into chunks and store embeddings."""

    def __init__(self, file_path, db_collection, file_type='pdf', page_start=0):
        """Prepare the splitter and the target collection.

        Args:
            file_path: Path of the file to load, or the raw text itself when
                ``file_type`` is neither ``'pdf'`` nor ``'text'``.
            db_collection: Name of the collection inside the configured
                database that receives the chunks.
            file_type: ``'pdf'``, ``'text'`` (case-insensitive) or anything
                else for raw text.
            page_start: Index of the first split chunk to store; ``None`` or
                a blank string means ``0``.

        Raises:
            ValueError: If ``page_start`` is non-blank and not an integer.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]
        self.start_page = self._parse_page_start(page_start)

    @staticmethod
    def _parse_page_start(page_start):
        """Convert the start offset to int, treating None/blank text as 0."""
        if page_start is None:
            return 0
        if isinstance(page_start, str) and not page_start.strip():
            # An untouched UI textbox delivers '', which int() would reject.
            return 0
        return int(page_start)

    def load_docs_split(self):
        """Load the source and split it into chunk documents.

        Returns:
            The list of chunks produced by the text splitter. PDF and text
            files go through their loader first; any other ``file_type``
            treats ``file_path`` as the text to split.
        """
        kind = str(self.file_type).lower()
        if kind == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif kind == 'text':
            loader = TextLoader(self.file_path)
        else:
            return self.text_splitter.create_documents([self.file_path])
        return self.text_splitter.split_documents(loader.load())

    def docs_embeddings(self):
        """Embed and store all chunks in a single bulk call.

        Returns:
            The vector store created from the chunks, or the string
            ``'Some issues'`` when there were no chunks.
        """
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'])
            print('done!')
            return docsearch
        else:
            print('documents is not embedded')
            return 'Some issues'

    def add_collection_database(self, doc):
        """Embed one chunk and insert it into the collection.

        Args:
            doc: A chunk with ``page_content`` and a ``metadata`` dict.
        """
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                # Raw strings carry no metadata, hence the fallbacks.
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0)
            }
        )

    def embedding_with_loop(self):
        """Embed and insert the chunks one at a time, with a progress bar.

        Raises:
            Exception: If loading/splitting produced no chunks.
        """
        docs = self.load_docs_split()
        if docs:
            # start_page is applied to the list of split chunks, i.e. it
            # skips that many chunks rather than that many source pages.
            for doc in tqdm(docs[self.start_page:]):
                self.add_collection_database(doc)
            print('Done')
        else:
            raise Exception('Some issue with it')
57
+