mohit-raghavendra committed
Commit bbd7b76 · 1 Parent(s): aa4608e

Initial upload

Files changed (8)
  1. README.md +1 -12
  2. app.py +76 -0
  3. config.yml +13 -0
  4. gt-policy-bot.yml +28 -0
  5. llm_client.py +41 -0
  6. pinecone_index.py +109 -0
  7. requirements.txt +11 -0
  8. vectorise.py +69 -0
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: Gt Policy Bot
- emoji: 📊
- colorFrom: gray
- colorTo: purple
- sdk: gradio
- sdk_version: 4.1.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # gt-policy-bot
app.py ADDED
@@ -0,0 +1,76 @@
+ import yaml
+
+ import gradio as gr
+
+ from typing import List
+
+ from llm_client import PalmClient
+ from pinecone_index import PineconeIndex
+
+ SYSTEM_MESSAGE = 'Give a precise answer to the question based on only the \
+ context and evidence and do not be verbose.'
+ TOP_K = 2
+
+
+ def format_prompt(question: str, evidence: List[str]):
+     # Accumulate the retrieved passages with +=; str.join on an empty
+     # string returns a new string and would discard each passage.
+     evidence_string = ''
+     for i, ev in enumerate(evidence):
+         evidence_string += f'\n Evidence {i+1}: {ev}'
+
+     content = f"{SYSTEM_MESSAGE} \
+     \n ### Question:{question} \
+     \n ### Evidence: {evidence_string} \
+     \n ### Response:"
+
+     return content
+
+
+ if __name__ == '__main__':
+
+     config_path = 'config.yml'
+     with open(config_path, 'r') as file:
+         config = yaml.safe_load(file)
+
+     print(config)
+
+     data_path = config['paths']['data_path']
+     project = config['paths']['project']
+
+     index_name = config['pinecone']['index-name']
+     embedding_model = config['sentence-transformers']['model-name']
+     embedding_dimension = config['sentence-transformers'][
+         'embedding-dimension']
+
+     index = PineconeIndex(index_name, embedding_model)
+     index.connect_index(embedding_dimension, False)
+
+     palm_client = PalmClient()
+
+     def get_answer(question: str):
+         # Retrieve the TOP_K most similar chunks, build the prompt,
+         # and return the model's answer followed by the evidence shown.
+         evidence = index.query(question, top_k=TOP_K)
+         prompt_with_evidence = format_prompt(question, evidence)
+         print(prompt_with_evidence)
+         response = palm_client.generate_text(prompt_with_evidence)
+         final_output = [response] + evidence
+
+         return final_output
+
+     context_outputs = [gr.Textbox(label=f'Evidence {i+1}')
+                        for i in range(TOP_K)]
+     result_output = [gr.Textbox(label='Answer')]
+
+     gradio_outputs = result_output + context_outputs
+     gradio_inputs = gr.Textbox(placeholder="Enter your question...")
+
+     demo = gr.Interface(
+         fn=get_answer,
+         inputs=gradio_inputs,
+         outputs=gradio_outputs,
+         title="GT Student Code of Conduct Bot",
+         description="Get LLM-powered answers to questions about the \
+ Georgia Tech Student Code of Conduct. The evidence passages are \
+ excerpts from the Code of Conduct."
+     )
+
+     demo.launch()
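
Aside: format_prompt is a pure function, so it can be sanity-checked offline. A minimal sketch with made-up question and evidence strings (the main block above is guarded, so importing app does not launch the demo, though the repo's dependencies must be installed):

    from app import format_prompt

    question = 'Can students appeal a conduct decision?'  # made-up example
    evidence = ['Students may appeal within ten business days.',  # made-up
                'Appeals are reviewed by the Dean of Students.']

    print(format_prompt(question, evidence))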
config.yml ADDED
@@ -0,0 +1,13 @@
+ paths:
+   project: 'code_of_conduct_1'
+   data_path: './data/code_of_conduct/'
+   chunking: 'manual'
+   auto_chunk_file: './data/code_of_conduct/code_of_conduct.csv'
+   manual_chunk_file: './data/code_of_conduct/code_of_conduct_manual.csv'
+
+ pinecone:
+   index-name: gt-code-of-conduct
+
+ sentence-transformers:
+   model-name: thenlper/gte-base
+   embedding-dimension: 768
gt-policy-bot.yml ADDED
@@ -0,0 +1,28 @@
+ name: gtpb
+ channels:
+   - defaults
+ dependencies:
+   - matplotlib
+   - numpy
+   - imageio
+   - scikit-learn
+   - notebook
+   - pandas
+   - scipy
+   - ipywidgets
+   - statsmodels
+   - jupyterlab
+   - plotly
+   - pip
+   - tqdm
+   - pip:
+       - kaleido
+       - colab_ssh
+       - gradio
+       - faiss-cpu
+       - pinecone-client
+       - pdfminer-six
+       - sentence-transformers
+       - torch
+       - langchain
+       - python-dotenv
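
This is a conda environment spec; assuming conda is available, the environment can be created and activated with:

    conda env create -f gt-policy-bot.yml
    conda activate gtpb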
llm_client.py ADDED
@@ -0,0 +1,41 @@
+ import os
+
+ import google.generativeai as palm
+
+
+ class PalmClient:
+     """Thin wrapper around the PaLM text generation API."""
+
+     def __init__(self):
+         self.connect_client()
+
+     def connect_client(self):
+         if not os.getenv('GOOGLE_PALM_KEY'):
+             raise Exception('Please set your Google MakerSuite API key')
+
+         api_key = os.getenv('GOOGLE_PALM_KEY')
+         palm.configure(api_key=api_key)
+
+         # Threshold 4 corresponds to BLOCK_NONE, relaxing each filter.
+         safety_overrides = [
+             {"category": "HARM_CATEGORY_DEROGATORY", "threshold": 4},
+             {"category": "HARM_CATEGORY_TOXICITY", "threshold": 4},
+             {"category": "HARM_CATEGORY_VIOLENCE", "threshold": 4},
+             {"category": "HARM_CATEGORY_SEXUAL", "threshold": 4},
+             {"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
+             {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4}
+         ]
+
+         defaults = {
+             'model': 'models/text-bison-001',
+             'temperature': 0.7,
+             'candidate_count': 1,
+             'top_k': 40,
+             'top_p': 0.95,
+             'max_output_tokens': 1024,
+             'stop_sequences': [],
+             'safety_settings': safety_overrides,
+         }
+
+         self.defaults = defaults
+
+     def generate_text(self, prompt: str) -> str:
+         response = palm.generate_text(**self.defaults, prompt=prompt)
+         return response.candidates[0]['output']
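
A minimal usage sketch, assuming GOOGLE_PALM_KEY is already exported in the shell (the prompt is a made-up example):

    from llm_client import PalmClient

    client = PalmClient()  # raises if GOOGLE_PALM_KEY is unset
    print(client.generate_text('What is the appeals process?'))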
pinecone_index.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import pinecone
+ import time
+ import yaml
+
+ import pandas as pd
+
+ from langchain.document_loaders import DataFrameLoader
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores.pinecone import Pinecone
+ from typing import List
+
+ from dotenv import load_dotenv
+ from pathlib import Path
+
+
+ class PineconeIndex:
+     def __init__(self, index_name: str, model_name: str):
+         self.index_name = index_name
+         self._embedding_model = HuggingFaceEmbeddings(model_name=model_name)
+
+     def connect_index(self, embedding_dimension: int,
+                       delete_existing: bool = False):
+         index_name = self.index_name
+
+         # Load Pinecone credentials from config.env when running in
+         # Google Colab and the environment variables are not already set.
+         if (not os.getenv('PINECONE_KEY')) or (not os.getenv('PINECONE_ENV')):
+             dotenv_path = Path('/content/gt-policy-bot/config.env')
+             load_dotenv(dotenv_path=dotenv_path)
+
+         pinecone.init(
+             api_key=os.getenv('PINECONE_KEY'),
+             environment=os.getenv('PINECONE_ENV'),
+         )
+
+         # Optionally drop and recreate the index from scratch.
+         if index_name in pinecone.list_indexes() and delete_existing:
+             pinecone.delete_index(index_name)
+
+         if index_name not in pinecone.list_indexes():
+             pinecone.create_index(index_name, dimension=embedding_dimension)
+
+         index = pinecone.Index(index_name)
+
+         pinecone.describe_index(index_name)
+         self._index = index
+
+     def upsert_docs(self, df: pd.DataFrame, text_col: str):
+         # Embed each row of the DataFrame and upsert it into the index.
+         loader = DataFrameLoader(df, page_content_column=text_col)
+         docs = loader.load()
+         Pinecone.from_documents(docs, self._embedding_model,
+                                 index_name=self.index_name)
+
+     def get_embedding_model(self):
+         return self._embedding_model
+
+     def get_index_name(self):
+         return self.index_name
+
+     def query(self, query: str, top_k: int = 5) -> List[str]:
+         docsearch = Pinecone.from_existing_index(self.index_name,
+                                                  self._embedding_model)
+         res = docsearch.similarity_search(query, k=top_k)
+
+         return [doc.page_content for doc in res]
+
+
+ if __name__ == '__main__':
+     config_path = 'config.yml'
+     with open(config_path, 'r') as file:
+         config = yaml.safe_load(file)
+
+     print(config)
+
+     index_name = config['pinecone']['index-name']
+     embedding_model = config['sentence-transformers']['model-name']
+     embedding_dimension = config['sentence-transformers'][
+         'embedding-dimension']
+     delete_existing = True
+
+     if config['paths']['chunking'] == 'manual':
+         print("Using manual chunking")
+         file_path_embedding = config['paths']['manual_chunk_file']
+         df = pd.read_csv(file_path_embedding, header=None, names=['chunks'])
+     else:
+         print("Using automatic chunking")
+         file_path_embedding = config['paths']['auto_chunk_file']
+         df = pd.read_csv(file_path_embedding, index_col=0)
+
+     print(df)
+     start_time = time.time()
+     index = PineconeIndex(index_name, embedding_model)
+     index.connect_index(embedding_dimension, delete_existing)
+     index.upsert_docs(df, 'chunks')
+     end_time = time.time()
+     print(f'Indexing took {end_time - start_time} seconds')
+
+     # Reconnect without deleting to verify the index serves queries.
+     index = PineconeIndex(index_name, embedding_model)
+     index.connect_index(embedding_dimension, delete_existing=False)
+
+     query = "When was the student code of conduct last revised?"
+     res = index.query(query, top_k=5)
+     print(res)
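
A usage sketch for the wrapper, assuming PINECONE_KEY and PINECONE_ENV are set; the two-row corpus is made up for illustration:

    import pandas as pd

    from pinecone_index import PineconeIndex

    df = pd.DataFrame({'chunks': [
        'The code of conduct was last revised in 2023.',  # made-up rows
        'Academic misconduct includes plagiarism.',
    ]})

    index = PineconeIndex('gt-code-of-conduct', 'thenlper/gte-base')
    index.connect_index(embedding_dimension=768, delete_existing=False)
    index.upsert_docs(df, 'chunks')
    print(index.query('When was the code revised?', top_k=1))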
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ pandas
+ pinecone-client
+ sentence-transformers
+ torch
+ tqdm
+ pdfminer-six
+ langchain
+ gradio
+ python-dotenv
+ faiss-cpu
+ google-generativeai
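
Alternatively, the same dependencies can be installed with pip:

    pip install -r requirements.txt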
vectorise.py ADDED
@@ -0,0 +1,69 @@
+ import tqdm
+ import yaml
+
+ import numpy as np
+ import pandas as pd
+
+ from sentence_transformers import SentenceTransformer
+
+ BATCH_SIZE = 2
+
+
+ class Vectorizer:
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.model = SentenceTransformer(model_name)
+         self.batch_size = BATCH_SIZE
+
+     def get_query_embedding(self, query: str) -> np.ndarray:
+         return self.model.encode(query)
+
+     def get_embeddings(self, df: pd.DataFrame, data_col: str):
+         # Encode the documents in batches, then flatten the batches
+         # back into one embedding per document.
+         docs = df[data_col]
+         num_docs = len(docs)
+         embeddings = []
+         for i in tqdm.tqdm(range(0, num_docs, self.batch_size)):
+             docs_batch = docs[i: i + self.batch_size].to_list()
+             vectors_batch = self.model.encode(docs_batch).tolist()
+             embeddings.append(vectors_batch)
+
+         embeddings_flattened = [embedding for batch in embeddings
+                                 for embedding in batch]
+
+         assert len(embeddings_flattened) == num_docs
+         return embeddings_flattened
+
+     def embed_docs(self, df: pd.DataFrame, data_col: str) -> pd.DataFrame:
+         embeddings = self.get_embeddings(df, data_col)
+         df['embeddings'] = embeddings
+
+         return df
+
+
+ def run_vectorizer(config_file_path="config.yml"):
+     with open(config_file_path, 'r') as file:
+         config = yaml.safe_load(file)
+     print("Config file loaded ...")
+     print(config)
+
+     data_path = config['paths']['data_path']
+     project = config['paths']['project']
+     file_ext = '.csv'
+
+     data_col_name = 'chunks'
+     df = pd.read_csv(data_path + project + file_ext)
+
+     vectorizer = Vectorizer(config['sentence-transformers']['model-name'])
+     df_embeddings = vectorizer.embed_docs(df, data_col_name)
+     print("Creation of embeddings completed ...")
+     print(df_embeddings.head())
+
+     file_path_embedding = data_path + project + '_embedding' + file_ext
+     df_embeddings.to_csv(file_path_embedding)
+
+     # Round-trip check: the saved CSV should match the DataFrame length.
+     df_read = pd.read_csv(file_path_embedding, index_col=0)
+     assert len(df_read) == len(df_embeddings)
+     print(file_path_embedding + " created ...")
+
+
+ if __name__ == "__main__":
+     run_vectorizer()
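
The Vectorizer can also be exercised without the config file; a self-contained sketch with a made-up three-row DataFrame (the gte-base weights download on first use):

    import pandas as pd

    from vectorise import Vectorizer

    df = pd.DataFrame({'chunks': ['First passage.', 'Second passage.',
                                  'Third passage.']})  # made-up rows
    vec = Vectorizer('thenlper/gte-base')
    df = vec.embed_docs(df, 'chunks')
    print(df['embeddings'].apply(len).tolist())  # three 768-dim vectors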