import gradio as gr
import re
import time
import streamlit as st
from transformers import pipeline, Conversation, AutoTokenizer
#"meta-llama/Llama-2-13b-chat-hf"
# Model choice and generation settings, kept in one place so they are easy to tweak.
my_config = {
    'model_name': "BramVanroy/Llama-2-13b-chat-dutch",
    'do_sample': True,
    'temperature': 0.1,
    'repetition_penalty': 1.1,
    'max_new_tokens': 500,
}

print(f"Loading the model: {my_config['model_name']}....")
time_load_model_start = time.time()

print(time_load_model_start)

# Load the model and tokenizer once, outside of the functions, so that every
# request reuses the same pipeline instead of reloading the weights.
#
# The task has to be "conversational": get_answer() feeds the pipeline a
# Conversation object and reads the reply via result[1]['content'], which is the
# input/output contract of the conversational pipeline. The "text-generation"
# task expects a plain string prompt and returns [{'generated_text': ...}].
llm = pipeline(
    "conversational",
    model=my_config['model_name'],
    tokenizer=AutoTokenizer.from_pretrained(my_config['model_name']),
    do_sample=my_config['do_sample'],
    temperature=my_config['temperature'],
    repetition_penalty=my_config['repetition_penalty'],
    max_new_tokens=my_config['max_new_tokens'],
)
time_load_model_end = time.time()
elapsed_time = time_load_model_end - time_load_model_start
print(f"Elapsed time to load the model: {elapsed_time:.2f} sec")

def count_words(text):
    """Return the number of whitespace-separated words in *text*."""
    return len(text.split())


def get_answer(chatbot, input_text):
    """Ask the chatbot one question and return its reply plus run statistics.

    Args:
        chatbot: Callable that is invoked with a ``Conversation`` built from
            ``input_text``. Its result is indexed as ``result[1]['content']``
            to obtain the reply text.
        input_text: The user's prompt.

    Returns:
        The reply text, followed by a line reporting how long the answer took
        and how many words it contains.
    """
    start_time = time.time()
    print(f"Processing the input\n {input_text}\n")
    print('Processing the answer....')
    conversation = Conversation(input_text)
    print(f"Conversation(input_text): {conversation}")
    # Index 0 is the user message that was just sent; index 1 is the reply.
    output = chatbot(conversation)[1]['content']
    elapsed_time = time.time() - start_time
    # Count the words before the statistics line is appended, so that the line
    # itself is not included in the reported number.
    nr_words = count_words(output)
    output += f"\nAnswered in {elapsed_time:.1f} seconds, Nr generated words: {nr_words}"

    return output


# Gradio calls `fn` with one positional argument per input component (here only
# the text box), while get_answer() also needs the pipeline as its first
# argument. Bind the already-loaded `llm` so each request supplies just the text.
demo = gr.Interface(
    fn=lambda input_text: get_answer(llm, input_text),
    inputs="text",
    outputs="text",
)
demo.launch()