import streamlit as st
import langchain
import pinecone
import transformers
import accelerate
from torch import cuda, bfloat16
from transformers import pipeline
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer
st.title("Language Model Chain")
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
index = pinecone.Index("langchain")
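# The retrieval step further down assumes the "langchain" index was populated with
# vectors whose metadata stores the source passage under a "text" key. A minimal
# sketch of how such an index could have been filled (hypothetical passages, not
# part of this app):
#
#   passages = ["first passage ...", "second passage ..."]
#   vectors = [
#       (str(i), vec, {"text": passage})
#       for i, (vec, passage) in enumerate(zip(embeddings.embed_documents(passages), passages))
#   ]
#   index.upsert(vectors=vectors)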
print("Program Started")
# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])
# # Display the selected model
# st.write("Selected Model:", selected_model)
model_loaded = False
model = None
repo_id="decapoda-research/llama-7b-hf"
@st.cache(allow_output_mutation=True)
def load_model():
    # Plan device placement from an empty (meta) model so no weights are
    # materialised just to infer the map.
    config = transformers.AutoConfig.from_pretrained(repo_id)
    with accelerate.init_empty_weights():
        fake_model = transformers.AutoModelForCausalLM.from_config(config)
    device_map = accelerate.infer_auto_device_map(fake_model)
    # Load the real weights in 8-bit, letting from_pretrained place them itself
    model = transformers.LlamaForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        load_in_8bit=True,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    print("Model Loaded")
    return model, tokenizer
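
# The device_map inferred above is not actually used; from_pretrained builds its
# own map via device_map="auto". If the hand-inferred map were preferred, it could
# be passed to from_pretrained inside load_model instead (a sketch, assuming the
# planned placement fits the available GPU memory):
#
#   model = transformers.LlamaForCausalLM.from_pretrained(
#       repo_id,
#       device_map=device_map,
#       load_in_8bit=True,
#       cache_dir="./cache",
#   )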

# Initialize session state variables
if "model_loaded" not in st.session_state:
    st.session_state["model_loaded"] = False
if "model" not in st.session_state:
    st.session_state["model"] = None
if "tokenizer" not in st.session_state:
    st.session_state["tokenizer"] = None

# Display the "Load Model" button
if not st.session_state["model_loaded"]:
    if st.button("Load Model"):
        model1, tokenizer1 = load_model()
        st.session_state["model"] = model1
        st.session_state["tokenizer"] = tokenizer1
        st.session_state["model_loaded"] = True
else:
    model1 = st.session_state["model"]
    tokenizer1 = st.session_state["tokenizer"]

if st.session_state["model_loaded"]:
    # Set up initial values for the generation pipeline parameters
    temperature = st.slider("Temperature: 'randomness' of the output, 0.0 is the minimum and 1.0 the maximum", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top-p: sample from the smallest set of tokens whose cumulative probability exceeds this value", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top-k: sample only from the k most likely tokens (0 disables top-k and relies on top-p)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max new tokens: maximum number of tokens to generate in the output", min_value=0, max_value=512, value=256, step=1)
    repetition_penalty = st.slider("Repetition penalty: values above 1.0 discourage repeated output", min_value=0.0, max_value=2.0, value=1.1, step=0.1)

    # Number of retrieved documents
    num_of_docs = st.selectbox("Number of documents to retrieve", range(2, 11), index=0)
    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")

    # Set up the prompt template
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the answers in the context of the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)
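
    # For illustration only: with a hypothetical question and retrieved passages,
    # the rendered prompt would look roughly like this (values are made up):
    #
    #   prompt.format(
    #       instruction="What is LangChain?",
    #       answers="LangChain is a framework for building LLM apps. It offers chains and agents.",
    #   )
    #   # -> 'Given the question "What is LangChain?" and its relevant answers as
    #   #    "LangChain is a framework ...", summarize the answers in the context of the question'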
    if st.button("Generate Text"):
        # Build the generation pipeline and display the generated text
        generate_text = pipeline(
            model=model1,
            tokenizer=tokenizer1,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # device=device
            # stopping_criteria=stopping_criteria,  # without this the model may ramble
            # we pass the model parameters here too
            temperature=temperature,                # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            top_p=top_p,                            # sample from top tokens whose cumulative probability adds up to top_p
            top_k=top_k,                            # sample from the top_k most likely tokens (0 relies on top_p alone)
            max_new_tokens=max_new_tokens,          # max number of tokens to generate in the output
            repetition_penalty=repetition_penalty,  # without this the output begins repeating
        )
        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        print("Inside Function")

        # Embed the query and retrieve the most similar documents from Pinecone
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = []
        for result in query_response['matches']:
            docs.append(result['metadata']['text'])
        answers = ' '.join(docs)

        if show_documents:
            st.text_area("Retrieved Vectors", answers)

        # Summarize the retrieved answers in the context of the question
        text = llm_chain.predict(instruction=query, answers=answers)
        st.text_area("Result", text)

        # Free cached GPU memory left over from generation
        cuda.empty_cache()
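
# To try the app locally (a usage sketch, assuming this file is saved as app.py and
# that streamlit, transformers, accelerate, bitsandbytes, langchain, pinecone-client
# and cohere are installed):
#
#   streamlit run app.py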