import os
import json
import requests
import torch
import gradio as gr
from langchain_community.llms import HuggingFaceEndpoint
from transformers import pipeline

#URL = os.environ["TGI_GAUDI_ENDPOINT_URL"]
#myport = os.environ["myport"]
URL = "198.175.88.52"
#URL = "100.81.119.213"
myport = "8080"

gaudi_device_url = f"http://{URL}:{myport}/generate"


# This assumes that TGI is already running on Gaudi, so we don't need to define the
# pipeline here; we just POST the prompt to the /generate endpoint, much like sending
# a curl command.
def text_gen(url, prompt):
    payload = {"inputs": prompt}
    resp = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    # TGI's /generate route returns a JSON object with a "generated_text" field.
    return resp.json()["generated_text"]


# CPU fallback: run a local gpt2 text-generation pipeline instead of calling TGI.
def text_gen_cpu(prompt):
    pipe = pipeline(task="text-generation", model="gpt2", tokenizer="gpt2", device="cpu", torch_dtype=torch.bfloat16)
    result = pipe(prompt, max_length=100, num_return_sequences=1)
    return result[0]["generated_text"]


# Hidden textbox that carries the endpoint URL into text_gen alongside the user's prompt.
url_box = gr.Textbox(label="url", value=gaudi_device_url, visible=False)

demo = gr.Interface(
    fn=text_gen,
    inputs=[url_box, "text"],
    outputs=["text"],
)

demo.launch()

# This is some demo code for using the LangChain HuggingFaceEndpoint wrapper with the same TGI server:
#llm = HuggingFaceEndpoint(
#    endpoint_url=f"http://{URL}:{myport}",  # TGI base URL (without /generate)
#    max_new_tokens=1024,
#    top_k=10,
#    top_p=0.95,
#    typical_p=0.95,
#    temperature=0.01,
#    repetition_penalty=1.03,
#    streaming=True,
#)
#result = llm.invoke("Why is the sky blue?")
#print(result)
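
# For reference, the text_gen call above is equivalent to the following curl request
# against TGI's /generate route (host/port taken from URL and myport defined above):
#
#   curl http://198.175.88.52:8080/generate \
#       -X POST \
#       -H 'Content-Type: application/json' \
#       -d '{"inputs": "Why is the sky blue?", "parameters": {"max_new_tokens": 100}}'
#
# Below is a sketch of a variant of text_gen that forwards generation parameters in the
# payload. The parameter names (max_new_tokens, temperature, top_p, do_sample) follow
# TGI's generate API; the helper name and default values here are illustrative
# assumptions, not part of the original script.
def text_gen_with_params(url, prompt, max_new_tokens=100, temperature=0.7, top_p=0.95):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        },
    }
    resp = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    return resp.json()["generated_text"]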