import streamlit as st
import pinecone
from torch import cuda
from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer

from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
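
# Streamlit RAG demo: embed the user's query with Cohere, retrieve matching chunks
# from a Pinecone index, and summarize them with an 8-bit LLaMA-7B model wrapped in
# a LangChain LLMChain. Run with: streamlit run <this_file>.py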



st.title("Language Model Chain")

# Pinecone / Cohere configuration (in a real deployment these keys belong in
# st.secrets or environment variables rather than in source code)
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
index = pinecone.Index(index_name)
print("Program started")
# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])

# # Display the selected model
# st.write("Selected Model:", selected_model)

repo_id = "decapoda-research/llama-7b-hf"

@st.cache(allow_output_mutation=True)  # on newer Streamlit, st.cache_resource replaces st.cache
def load_model():
    # Load LLaMA-7B in 8-bit and let accelerate place layers across the available devices;
    # downloaded weights are cached under ./cache.
    model = LlamaForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        load_in_8bit=True,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    print("Model loaded")
    return model, tokenizer

# Initialize session state variables
if "model_loaded" not in st.session_state:
    st.session_state["model_loaded"] = False
if "model" not in st.session_state:
    st.session_state["model"] = None
if "tokenizer" not in st.session_state:
    st.session_state["tokenizer"] = None

# Display the "Load Model" button
if not st.session_state["model_loaded"]:
    if st.button("Load Model"):
        model1, tokenizer1 = load_model()
        st.session_state["model"] = model1
        st.session_state["tokenizer"] = tokenizer1
        st.session_state["model_loaded"] = True
else:
    model1 = st.session_state["model"]
    tokenizer1 = st.session_state["tokenizer"]


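# Main app: once the model is loaded, expose generation parameters, take a query,
# retrieve supporting documents from Pinecone, and summarize them with the LLM.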
if st.session_state["model_loaded"]:
# Set up initial values for pipeline parameters
    temperature = st.slider("Temperature  'randomness' of outputs, 0.0 is the min and 1.0 the max", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top P select from top tokens whose probability add up to 15%", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top K select from top 0 tokens (because zero, relies on top_p)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max New Tokens  max number of tokens to generate in the output", min_value=0, max_value=512, value=256, step=1)
    repetition_penalty = st.slider("Repetition Penalty (1.0 = no penalty; without it the output tends to repeat)", min_value=1.0, max_value=2.0, value=1.1, step=0.1)
    # Number of documents to retrieve from Pinecone
    num_of_docs = st.selectbox("Number of retrieved documents", range(2, 11), index=0)

    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")

    # Prompt template: summarize the retrieved answers in the context of the question
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the answers in context of the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)
    
    
    if st.button("Generate Text"):
        #Call the pipeline and display the generated text
        generate_text = pipeline(
        model=model1, tokenizer=tokenizer1,
        return_full_text=True,  # langchain expects the full text
        task='text-generation',
        #device=device
        # we pass model parameters here too
        #stopping_criteria=stopping_criteria,  # without this model will ramble
        temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        top_p=top_p,  # select from top tokens whose probability add up to 15%
        top_k=top_k,  # select from top 0 tokens (because zero, relies on top_p)
        max_new_tokens=max_new_tokens,  # mex number of tokens to generate in the output
        repetition_penalty=repetition_penalty  # without this output begins repeating
        )  
    
    
        # Wrap the pipeline for LangChain and build the summarization chain
        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)

        # Embed the query with Cohere and fetch the top matches from Pinecone
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = [match['metadata']['text'] for match in query_response['matches']]
        answers = ' '.join(docs)
        if show_documents:
            st.text_area("Retrieved Documents", answers)

        # Run the chain and display the generated summary
        text = llm_chain.predict(instruction=query, answers=answers)
        st.text_area("Result", text)

        # Free cached GPU memory after generation
        cuda.empty_cache()