# Krypton / app.py
# import torch
# import gradio as gr
# from transformers import pipeline, TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
# from PIL import Image
# import requests
# import threading
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Krypton 🕋</h1>
<p>This demo uses the open-source model <a href="https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers"><b>xtuner/llava-llama-3-8b-v1_1-transformers</b></a></p>
</div>
'''
# model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
# pipe = pipeline("image-to-text", model=model_id, device_map="auto")
# # Load the Llama 3 tokenizer and model onto the GPU to prepare for generation
# llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.float16).to('cuda')
# terminators = [
# llama_tokenizer.eos_token_id,
# llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]
# def krypton(prompt,
#             history,
#             input_image,
#             max_new_tokens,
#             temperature,
#             num_beams,
#             do_sample: bool=True):
#     """
#     Takes an image as input, runs it through the generation pipeline,
#     and returns the output. This is multimodal.
#     """
#     conversation = []
#     for user, assistant in history:
#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
#     conversation.append({"role": "user", "content": prompt})
#     input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
#     streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
#     llava_generation_kwargs = dict(
#         input_ids=input_ids,
#         streamer=streamer,
#         max_new_tokens=max_new_tokens,
#         num_beams=num_beams,
#         do_sample=do_sample
#     )
#     if temperature == 0.0:
#         do_sample = False
#     pil_image = Image.fromarray(input_image.astype('uint8'), 'RGB')
#     # Pipeline generation
#     outputs = pipeline()  # prototype left unfinished; replaced by the pipeline call below
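# A minimal sketch (kept commented out, like the prototype above) of how that streaming
# path could be finished, assuming the llama_model, streamer and llava_generation_kwargs
# objects defined above: generation runs in a background thread while partial text is
# yielded from the TextIteratorStreamer. The helper name stream_llama is hypothetical.
# def stream_llama(llava_generation_kwargs, streamer):
#     thread = threading.Thread(target=llama_model.generate, kwargs=llava_generation_kwargs)
#     thread.start()
#     partial_text = ""
#     for new_text in streamer:
#         partial_text += new_text
#         yield partial_text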
from transformers import pipeline
from PIL import Image
import requests
import torch
import subprocess
import gradio as gr
# Load the LLaVA-Llama-3 image-to-text pipeline in fp16 on the first GPU
model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
pipe = pipeline("image-to-text", model=model_id, torch_dtype=torch.float16, device=0)
def krypton(input_image):
    """Runs the LLaVA pipeline on the uploaded image and returns the generated text."""
    # Gradio passes the image in as a numpy array; convert it to a PIL image for the pipeline
    pil_image = Image.fromarray(input_image.astype('uint8'), 'RGB')
    # image = Image.open(requests.get(url, stream=True).raw)
    prompt = ("<|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat are these?<|eot_id|>"
              "<|start_header_id|>assistant<|end_header_id|>\n\n")
    outputs = pipe(pil_image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    # Log GPU usage to the Space logs for debugging
    nvidia_result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(nvidia_result.stdout.decode('utf-8'))
    return outputs[0]["generated_text"]
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.Interface(
        fn=krypton,
        inputs="image",
        outputs="text",
        fill_height=True
    )
if __name__ == "__main__":
    demo.launch()