# --- Earlier draft: streaming multimodal chat (kept commented out for reference) ---
# import torch
# import gradio as gr
# from transformers import pipeline, TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
# from PIL import Image
# import requests
# import threading
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Krypton π</h1>
<p>This demo uses the open-source model <a href="https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers"><b>xtuner/llava-llama-3-8b-v1_1-transformers</b></a>.</p>
</div>
'''
# model_id = "xtuner/llava-llama-3-8b-v1_1-transformers" | |
# pipe = pipeline("image-to-text", model=model_id, device_map="auto") | |
# # Place transformers in hardware to prepare for process and generation | |
# llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") | |
# llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.float16).to('cuda') | |
# terminators = [ | |
# llama_tokenizer.eos_token_id, | |
# llama_tokenizer.convert_tokens_to_ids("<|eot_id|>") | |
# ] | |
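# # Llama-3-Instruct ends an assistant turn with <|eot_id|> in addition to the
# # regular EOS token, so both ids are used as stopping criteria for generate().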
# def krypton(prompt,
#             history,
#             input_image,
#             max_new_tokens,
#             temperature,
#             num_beams,
#             do_sample: bool = True):
#     """
#     Multimodal chat handler: rebuilds the Llama-3 conversation from the
#     Gradio history, then streams a response for the new prompt and image.
#     """
#     conversation = []
#     for user, assistant in history:
#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
#     conversation.append({"role": "user", "content": prompt})
#     input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)
#     streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
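# # TextIteratorStreamer exposes decoded tokens as an iterator while generate()
# # runs, so generation itself has to happen on a background thread (see below).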
#     # generate() only honors temperature when sampling, so fall back to
#     # greedy decoding when the temperature slider is at 0.0
#     if temperature == 0.0:
#         do_sample = False
#     llava_generation_kwargs = dict(
#         input_ids=input_ids,
#         streamer=streamer,
#         max_new_tokens=max_new_tokens,
#         temperature=temperature,
#         num_beams=num_beams,
#         do_sample=do_sample,
#         eos_token_id=terminators,
#     )
#     pil_image = Image.fromarray(input_image.astype('uint8'), 'RGB')  # image input was never wired into this draft
#     # Run generation on a background thread so the streamer can yield partial text
#     thread = threading.Thread(target=llama_model.generate, kwargs=llava_generation_kwargs)
#     thread.start()
#     buffer = ""
#     for new_text in streamer:
#         buffer += new_text
#         yield buffer
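
# --- Active version: single-shot image description via the image-to-text pipeline ---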
from transformers import pipeline
from PIL import Image
import requests
import torch
import subprocess
import gradio as gr

model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
# fp16 weights on GPU device 0 (this Space assumes a CUDA GPU is available)
pipe = pipeline("image-to-text", model=model_id, torch_dtype=torch.float16, device=0)
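
# Quick smoke test (a commented-out sketch; the URL is the COCO sample image
# used in the model card's example):
#   url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#   image = Image.open(requests.get(url, stream=True).raw)
#   outputs = pipe(image,
#                  prompt="<|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat are these?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
#                  generate_kwargs={"max_new_tokens": 50})
#   print(outputs[0]["generated_text"])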
def krypton(input_image):
    """Describes the uploaded image with one LLaVA generation pass."""
    pil_image = Image.fromarray(input_image.astype('uint8'), 'RGB')
    # image = Image.open(requests.get(url, stream=True).raw)
    # Prompt in the Llama-3 chat format that this LLaVA checkpoint expects
    prompt = ("<|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat are these?<|eot_id|>"
              "<|start_header_id|>assistant<|end_header_id|>\n\n")
    outputs = pipe(pil_image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    # Print GPU utilization to the Space logs for debugging
    print(subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE).stdout.decode())
    return outputs[0]["generated_text"]
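
# Example direct call (a sketch; "cats.jpg" is a hypothetical local file):
#   import numpy as np
#   print(krypton(np.array(Image.open("cats.jpg").convert("RGB"))))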
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.Interface(
        fn=krypton,
        inputs="image",
        outputs="text",
        fill_height=True
    )

if __name__ == "__main__":
    demo.launch()