import spaces  # must be imported at the very top, before any GPU-initializing libraries
import gradio as gr
import os
# Read the Hugging Face access token from the environment
hf_token = os.getenv("HF_API_TOKEN")
# Base model name
base_model_name = "larry1129/meta-llama-3.1-8b-bnb-4bit"
# Adapter (LoRA) model name
adapter_model_name = "larry1129/WooWoof_AI"
# Global variables that cache the model and tokenizer across calls
model = None
tokenizer = None
# Build an Alpaca-style prompt from the instruction and optional input
def generate_prompt(instruction, input_text=""):
    if input_text:
        prompt = f"""### Instruction:
{instruction}
### Input:
{input_text}
### Response:
"""
    else:
        prompt = f"""### Instruction:
{instruction}
### Response:
"""
    return prompt
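# Example: generate_prompt("Name three dog breeds.") returns
# "### Instruction:\nName three dog breeds.\n### Response:\n"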
# Generate a response; @spaces.GPU reserves a ZeroGPU device for up to 120 s per call
@spaces.GPU(duration=120)
def generate_response(instruction, input_text):
    global model, tokenizer
    if model is None:
        # Make sure bitsandbytes is installed (needed for 4-bit loading)
        import importlib.util
        if importlib.util.find_spec("bitsandbytes") is None:
            import subprocess
            subprocess.call(["pip", "install", "--upgrade", "bitsandbytes"])
        # Import GPU-dependent libraries inside the function, as ZeroGPU expects
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        from peft import PeftModel
        # 4-bit NF4 quantization config with double quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_auth_token=hf_token)
        # Load the 4-bit quantized base model, then attach the LoRA adapter
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=bnb_config,
            device_map="auto",
            use_auth_token=hf_token,
        )
        model = PeftModel.from_pretrained(base_model, adapter_model_name, use_auth_token=hf_token)
        # Set pad_token and switch the model to evaluation mode
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        model.eval()
    # torch is needed on every call, so import it outside the one-time loading branch
    import torch
    # Build the prompt and run generation
    prompt = generate_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the final "### Response:" marker
    response = response.split("### Response:")[-1].strip()
    return response
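# Example call (hypothetical inputs): generate_response("How often should I walk my dog?", "")
# returns only the model's text after the "### Response:" marker.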
# Build the Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter an instruction...", label="Instruction"),
        gr.Textbox(lines=2, placeholder="Optional extra input goes here...", label="Input (optional)"),
    ],
    outputs="text",
    title="WooWoof AI Interactive Chat",
    description="A large language model based on LLAMA 3.1 that takes an instruction and an optional input.",
    allow_flagging="never",
)
# Launch the Gradio interface
iface.launch()
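# To run locally, export the access token first (placeholder token shown):
#   HF_API_TOKEN=hf_xxx python app.py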