import os
from typing import Union

from fastapi import FastAPI
from pydantic import BaseModel
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEndpoint

from custom_llm import CustomLLM  # local module; used only by the commented-out alternative below
# Request bodies for the routes defined below.


class ConversationPost(BaseModel):
    tenant: Union[str, None] = None
    module: Union[str, None] = None
    question: str


class InferencePost(BaseModel):
    question: str
    with_template: Union[str, None] = None  # 'llama', 'qwen', or 'qwen2'


class LLMPost(BaseModel):
    model: str  # 'llama', 'qwen', or anything else for qwen2
    question: str
# Read the Hugging Face token from the HF_API_KEY secret and expose it under
# the variable name that langchain_huggingface expects.
API_TOKEN = os.environ["HF_API_KEY"]
os.environ["HUGGINGFACEHUB_API_TOKEN"] = API_TOKEN

app = FastAPI()
# ChatML-style template used by the Qwen chat models.
prompt_qwen = PromptTemplate.from_template("""<|im_start|>system
Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
""")

# Llama 3 chat template.
prompt_llama = PromptTemplate.from_template("""<|start_header_id|>system<|end_header_id|>
Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|eot_id|><|start_header_id|>user<|end_header_id|>
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""")
# Alternative: run the model in-process instead of over the Inference API.
# llm = prompt | HuggingFacePipeline.from_model_id(
#     model_id="Qwen/Qwen2-1.5B-Instruct",
#     task="text-generation",
#     pipeline_kwargs={
#         "max_new_tokens": 150,
#         "return_full_text": False,
#     },
# )
# Hosted text-generation endpoints on the Hugging Face Inference API.
# do_sample=False selects greedy decoding for deterministic output.
llama = HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)

qwen = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen1.5-4B-Chat",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)

qwen2 = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2-1.5B-Instruct",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)
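
# Sanity-check sketch (illustration, not in the original file): each endpoint
# accepts a raw prompt string and returns the generated text, e.g.
#   qwen2.invoke("Halo, apa kabar?")  # -> completion string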
# LCEL chains: each pipes its prompt template into an endpoint.
llm = prompt_qwen | qwen
llm2 = prompt_llama | llama
llm3 = prompt_qwen | qwen2
# Alternative using the local CustomLLM wrapper:
# llm = prompt | CustomLLM(repo_id="Qwen/Qwen-VL-Chat", model_type='text-generation', api_token=API_TOKEN, max_new_tokens=150).bind(stop=['<|im_end|>'])
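
# Composition note (sketch): `prompt | endpoint` builds a RunnableSequence, so
#   llm.invoke({"question": "Apa ibu kota Indonesia?"})
# formats the dict through prompt_qwen and sends the resulting ChatML string
# to the qwen endpoint; it is equivalent to
#   qwen.invoke(prompt_qwen.format(question="Apa ibu kota Indonesia?"))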
# NOTE: the route decorators were missing from this copy of the file; the
# paths below are assumptions, restored so the app is servable. Adjust them
# to match the deployed Space if they differ.


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/chat")
async def chat(data: LLMPost):
    # Route the raw question (no prompt template) to the requested model.
    if data.model == 'llama':
        return {"data": llama.invoke(data.question)}
    elif data.model == 'qwen':
        return {"data": qwen.invoke(data.question)}
    else:
        return {"data": qwen2.invoke(data.question)}


@app.post("/conversation")
async def conversation(data: ConversationPost):
    return {"output": llm.invoke({"question": data.question})}


@app.post("/conversation2")
async def conversation2(data: ConversationPost):
    return {"output": llm2.invoke({"question": data.question})}


@app.post("/conversation3")
async def conversation3(data: ConversationPost):
    return {"output": llm3.invoke({"question": data.question})}


@app.post("/inference")
async def inference(data: InferencePost):
    # Apply the requested chat template, or hit llama with the bare question.
    if data.with_template == 'llama':
        out = llm2.invoke(data.question)
    elif data.with_template == 'qwen':
        out = llm.invoke(data.question)
    elif data.with_template == 'qwen2':
        out = llm3.invoke(data.question)
    else:
        out = llama.invoke(data.question)
    return {"output": out}