from threading import Thread
import os
from typing import Iterator

import gradio as gr
import torch
from gradio.components import textbox
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, TextIteratorStreamer, LlamaTokenizer

from transformers import AutoTokenizer, AutoModelForCausalLM

# Location of the GGUF weights used by this app.
MODEL_REPO_ID = "igor-im/flux_prompt_expander"
MODEL_FILENAME = "unsloth.Q8_0.gguf"

# The model is loaded once, at module level, and the resulting `llm` object is
# what `response` calls for every request.
# Optional keyword arguments that can be added to the call below:
#   n_gpu_layers=-1  -> use GPU acceleration
#   seed=1337        -> set a specific seed
#   n_ctx=2048       -> increase the context window
llm = Llama.from_pretrained(
    repo_id=MODEL_REPO_ID,
    filename=MODEL_FILENAME,
)

def response(prompt):
    """Run ``prompt`` through the module-level ``llm`` and return its text.

    Args:
        prompt: Text handed to the model as the completion prompt.

    Returns:
        The ``text`` field of the first entry in the completion's
        ``choices`` list. The call sets ``echo=True``, so the returned text
        contains the prompt itself as well as the generated output.
    """
    completion = llm(
        prompt,
        max_tokens=200,  # generate up to 200 tokens; None would run to the end of the context window
        echo=True,  # echo the prompt back in the output
    )
    first_choice = completion.get('choices')[0]
    return first_choice.get('text')


# Single-textbox UI: the submitted text is passed to `response`, and the value
# it returns is shown in the output textbox.
interface = gr.Interface(
    fn=response,
    inputs='textbox',
    outputs='textbox',
)

# Start the Gradio app. This runs whenever the module is executed or imported.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this module
# is ever imported from elsewhere — confirm how the app is deployed first.
interface.launch()