File size: 3,559 Bytes
2980408
 
 
 
 
31ca135
a9ea810
31ca135
2980408
 
9ddaca4
 
 
31ca135
9ddaca4
 
a9ea810
9ddaca4
 
 
 
 
 
a9ea810
8c2e21c
 
 
 
 
359755a
8c2e21c
 
 
 
a9ea810
 
2980408
 
 
a1fee37
6195c27
 
2980408
 
 
 
 
 
 
 
6195c27
2980408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a2d3ef
78c83b7
2980408
3a2d3ef
 
 
a657bab
3a2d3ef
2980408
8c2e21c
2980408
3a2d3ef
 
 
2980408
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
import openai
import pandas as pd 
import numpy as np
import csv
import os
from datasets import load_dataset
# OpenAI credential, read from the environment variable literally named
# "openai.api_key"; os.environ.get yields None when it is unset.
openai.api_key= os.environ.get("openai.api_key")
from openai.embeddings_utils import get_embedding
# NOTE(review): this binding of `cosine_similarity` is replaced by the
# scikit-learn import of the same name at the end of this import block, so
# code defined afterwards gets the scikit-learn implementation.
from openai.embeddings_utils import cosine_similarity
import requests
# Model identifier handed to generate_embeddings() when embedding a question.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
import json
# Hugging Face API token, read from the "hf_token" environment variable
# (None when unset).
hf_token = os.environ.get("hf_token")
import re
# Rebinds `cosine_similarity`; this is the version in effect below.
from sklearn.metrics.pairwise import cosine_similarity

def generate_embeddings(texts, model_id, hf_token):
    """Embed ``texts`` through the hosted Hugging Face feature-extraction API.

    Args:
        texts: Value sent as the request's ``"inputs"`` field (the caller in
            this file passes a single question string).
        model_id: Model identifier interpolated into the endpoint URL.
        hf_token: API token sent as a ``Bearer`` credential.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        RuntimeError: If the body is a JSON object carrying an ``"error"``
            key instead of embeddings.
    """
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    # "wait_for_model" asks the service to hold the request while the model
    # loads rather than rejecting it immediately.
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload)

    # Fail here, with the real cause, instead of handing an error document to
    # the caller where it would blow up later inside the similarity math.
    response.raise_for_status()
    embeddings = response.json()
    if isinstance(embeddings, dict) and "error" in embeddings:
        raise RuntimeError(f"Embedding request failed: {embeddings['error']}")
    return embeddings
# Passage/embedding corpora, loaded with datasets.load_dataset when the module
# is imported.
Bio_embeddings = load_dataset('vjain/biology_AP_embeddings')
Physics_embeddings = load_dataset('vjain/AP_physics_embeddings')

# Turn each corpus' "train" split into a DataFrame.
df1, df2 = (
    pd.DataFrame(corpus['train'])
    for corpus in (Bio_embeddings, Physics_embeddings)
)

# Seed a zero-filled "similarity" column on both frames.
# NOTE(review): nothing in the visible file reads this column - confirm it is
# still needed.
df1["similarity"] = 0
df2["similarity"] = 0

# Registry of searchable corpora, keyed by name.
# NOTE(review): the "TA_embeddings" key holds the physics frame - confirm the
# key name is intentional.
dataframes = dict(Bio_embeddings=df1, TA_embeddings=df2)

# Earlier local-CSV loading path, left disabled:
#df = pd.read_csv("TA_embeddings.csv")
#df["embedding"]=df["embedding"].apply(eval).apply(np.array)
def reply(input, dataset=None):
    """Answer a question using the most similar stored passages as context.

    The question is embedded with ``generate_embeddings``, compared against
    every stored passage embedding of the selected corpus, and the ten
    closest passages are pasted into a prompt for the OpenAI completion
    endpoint.

    Args:
        input: The user's question. The name shadows the builtin but is kept
            so existing keyword callers keep working.
        dataset: Key into the module-level ``dataframes`` dict selecting the
            corpus to search. ``None`` selects the first registered corpus,
            so one-argument calls still work.

    Returns:
        The completion text with surrounding spaces and newlines stripped, or
        a short hint when the question is blank.

    Raises:
        ValueError: If ``dataset`` is not a key of ``dataframes``.
    """
    # Skip both paid API calls when there is nothing to answer.
    if not input or not input.strip():
        return "Please enter a question."

    if dataset is None:
        dataset = next(iter(dataframes))
    if dataset not in dataframes:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {list(dataframes)}"
        )
    frame = dataframes[dataset]

    input_vector = generate_embeddings(input, model_id, hf_token)

    # Score every passage in one pairwise call instead of one call per row.
    # assumes each "embedding" cell is a numeric sequence of equal length -
    # TODO confirm against the dataset schema.
    passage_matrix = np.vstack(frame["embedding"].tolist())
    scores = cosine_similarity(passage_matrix, [input_vector])[:, 0]

    # assign() returns a new frame, so concurrent requests never overwrite
    # each other's scores on the shared module-level DataFrame.
    data = (
        frame.assign(similarities=scores)
        .sort_values("similarities", ascending=False)
        .head(10)
    )
    # Snapshot of the retrieved passages, overwritten on every request.
    data.to_csv("sorted.csv")

    context = "\n".join(data["text"])
    prompt = f"""
                Answer the following question using the context given below.If you don't know the answer for certain, say I don't know.
                Context: {context}

                Q: {input}

                """
    # NOTE(review): "text-davinci-003" has been retired by OpenAI - confirm
    # which completion model should replace it.
    completion = openai.Completion.create(
        prompt=prompt,
        temperature=1,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        model="text-davinci-003",
    )
    return completion["choices"][0]["text"].strip(" \n")


# --- Gradio wiring ---------------------------------------------------------
input_text = gr.inputs.Textbox(label="Enter your questions here", placeholder="E.g. What is DNA?")
# Lets the user pick which corpus reply() searches; options mirror the keys
# of `dataframes`, and the first one is preselected.
csv_dropdown = gr.inputs.Dropdown(
    choices=list(dataframes),
    default=next(iter(dataframes)),
    label="Knowledge base",
)
text_output = gr.outputs.Textbox(label="Answer")

description = "Scholar Bot is a question answering system designed to provide accurate and relevant answers to questions from this book hosted by OpenStax https://openstax.org/details/books/biology-ap-courses. Simply enter your question in the text box above and Scholar Bot will use advanced natural language processing algorithms to search a large corpus of biology text to find the best answer for you. Scholar Bot uses the Sentence Transformers model to generate embeddings of text, and OpenAI's GPT-3 language model to provide answers to your questions."

# The two inputs are passed positionally to reply(input, dataset).
ui = gr.Interface(fn=reply,
                  inputs=[input_text, csv_dropdown],
                  outputs=[text_output],
                  title="Scholar Bot",
                  description=description,
                  theme="light",
                  layout="vertical",
                  inputs_layout="stacked",
                  outputs_layout="stacked",
                  allow_flagging=False)

ui.launch()