import os
import json
import requests
import torch
import gradio as gr
from langchain_community.llms import HuggingFaceEndpoint
from transformers import pipeline

#URL = os.environ["TGI_GAUDI_ENDPOINT_URL"]
#myport = os.environ["myport"]
URL = "198.175.88.52"
#URL = "100.81.119.213"
myport = "8080"

gaudi_device_url = f"http://{URL}:{myport}/generate"


# This assumes that TGI is already running on Gaudi, so we don't need to define the
# pipeline here; we just POST the prompt to the /generate endpoint, much like sending
# a curl command.
def text_gen(url, prompt):
    payload = {"inputs": prompt}
    resp = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    # TGI's /generate route returns a JSON object with a "generated_text" field.
    return resp.json()["generated_text"]


# CPU fallback: run a local gpt2 text-generation pipeline instead of calling TGI.
def text_gen_cpu(prompt):
    pipe = pipeline(task="text-generation", model="gpt2", tokenizer="gpt2", device="cpu", torch_dtype=torch.bfloat16)
    result = pipe(prompt, max_length=100, num_return_sequences=1)
    return result[0]["generated_text"]


# Hidden textbox that carries the endpoint URL into text_gen alongside the user's prompt.
url_box = gr.Textbox(label="url", value=gaudi_device_url, visible=False)

demo = gr.Interface(
    fn=text_gen,
    inputs=[url_box, "text"],
    outputs=["text"],
)

demo.launch()

# This is some demo code for using the LangChain HuggingFaceEndpoint wrapper with the same TGI server:
#llm = HuggingFaceEndpoint(
#    endpoint_url=f"http://{URL}:{myport}",  # TGI base URL (without /generate)
#    max_new_tokens=1024,
#    top_k=10,
#    top_p=0.95,
#    typical_p=0.95,
#    temperature=0.01,
#    repetition_penalty=1.03,
#    streaming=True,
#)
#result = llm.invoke("Why is the sky blue?")
#print(result)
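
# For reference, the text_gen call above is equivalent to the following curl request
# against TGI's /generate route (host/port taken from URL and myport defined above):
#
#   curl http://198.175.88.52:8080/generate \
#       -X POST \
#       -H 'Content-Type: application/json' \
#       -d '{"inputs": "Why is the sky blue?", "parameters": {"max_new_tokens": 100}}'
#
# Below is a sketch of a variant of text_gen that forwards generation parameters in the
# payload. The parameter names (max_new_tokens, temperature, top_p, do_sample) follow
# TGI's generate API; the helper name and default values here are illustrative
# assumptions, not part of the original script.
def text_gen_with_params(url, prompt, max_new_tokens=100, temperature=0.7, top_p=0.95):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        },
    }
    resp = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    return resp.json()["generated_text"]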