Spaces: Running on Zero
Upload step03_chatbot.py with huggingface_hub
step03_chatbot.py  (+56 -22)  CHANGED
@@ -638,6 +638,10 @@ class GenericRAGChatbot:
         print(" - Chargement du tokenizer...")
         self.generation_tokenizer = AutoTokenizer.from_pretrained(self.generation_model_name)

+        # Correct configuration for Qwen3
+        if self.generation_tokenizer.pad_token is None:
+            self.generation_tokenizer.pad_token = self.generation_tokenizer.eos_token
+
         # Model configuration for the current platform
         model_kwargs = self._get_generation_model_config()

@@ -838,13 +842,31 @@ Instructions importantes:
         messages.append({"role": "user", "content": user_message})

         try:
+            # Manual formatting, more stable on ZeroGPU
+            formatted_messages = []
+            for msg in messages:
+                if msg["role"] == "system":
+                    formatted_messages.append(f"<|im_start|>system\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "user":
+                    formatted_messages.append(f"<|im_start|>user\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "assistant":
+                    formatted_messages.append(f"<|im_start|>assistant\n{msg['content']}<|im_end|>")
+
+            # Append the generation prompt
+            formatted_messages.append("<|im_start|>assistant\n")
+            formatted_prompt = "\n".join(formatted_messages)
+
             # Tokenization
-            inputs = self.generation_tokenizer…
-            …
-            …
-            …
-            …
-            …
+            inputs = self.generation_tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=4096,
+                padding=True
+            )
+
+            # Move to the device
+            inputs = {k: v.to(self.generation_device) for k, v in inputs.items()}

             # Streamed generation
             from transformers import TextIteratorStreamer
@@ -858,13 +880,15 @@ Instructions importantes:
             )

             generation_kwargs = {
-                "input_ids": inputs,
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
                 "streamer": streamer,
-                "max_new_tokens": …
+                "max_new_tokens": 512,
                 "temperature": 0.7,
                 "do_sample": True,
-                "pad_token_id": self.generation_tokenizer.…
+                "pad_token_id": self.generation_tokenizer.pad_token_id,
                 "eos_token_id": self.generation_tokenizer.eos_token_id,
+                "use_cache": True
             }

             # Launch generation in a separate thread
@@ -919,33 +943,43 @@ Réponds à cette question en te basant sur le contexte fourni."""

         # Formatting for the model
         try:
-            # …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+            # Manual formatting, more stable on ZeroGPU
+            formatted_messages = []
+            for msg in messages:
+                if msg["role"] == "system":
+                    formatted_messages.append(f"<|im_start|>system\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "user":
+                    formatted_messages.append(f"<|im_start|>user\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "assistant":
+                    formatted_messages.append(f"<|im_start|>assistant\n{msg['content']}<|im_end|>")
+
+            # Append the generation prompt
+            formatted_messages.append("<|im_start|>assistant\n")
+            formatted_prompt = "\n".join(formatted_messages)
+
+            # Tokenization with appropriate padding and attention mask
             inputs = self.generation_tokenizer(
                 formatted_prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=4096
+                max_length=4096,
+                padding=True
             )

             # Move to the device
             inputs = {k: v.to(self.generation_device) for k, v in inputs.items()}

-            # Generation
+            # Generation with simplified parameters
             with torch.no_grad():
                 outputs = self.generation_model.generate(
-                    …
-                    …
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    max_new_tokens=512,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.generation_tokenizer.…
+                    pad_token_id=self.generation_tokenizer.pad_token_id,
                     eos_token_id=self.generation_tokenizer.eos_token_id,
+                    use_cache=True
                 )

             # Decode the response
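Read together, the hunks converge on one pattern: give the Qwen tokenizer a pad token, build the ChatML prompt (`<|im_start|>role ... <|im_end|>`) by hand, tokenize with truncation and padding so an attention_mask is returned, and pass that mask plus explicit pad/eos token ids to generate(). The sketch below shows that pattern in isolation; the checkpoint name and the example messages are stand-ins, not taken from the Space, while the 4096/512 token limits and sampling settings mirror the diff.

# Minimal sketch of the pattern adopted in this commit (placeholders flagged inline).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"   # placeholder checkpoint, not the Space's model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 1) Make sure a pad token exists so padding and attention masks are well defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2) Build the ChatML prompt manually, then open an assistant turn for generation.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},          # example content
    {"role": "user", "content": "What is retrieval-augmented generation?"}, # example content
]
parts = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in messages]
parts.append("<|im_start|>assistant\n")
prompt = "\n".join(parts)

# 3) Tokenize with truncation and padding so attention_mask comes back with input_ids.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# 4) Generate with the attention mask and explicit pad/eos ids, as the diff does.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

# Decode only the newly generated tokens.
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))

The streamed path in the middle hunks follows the same recipe, except that the tokenized inputs go into generation_kwargs together with a transformers TextIteratorStreamer and generate() runs in a separate thread; only the way tokens are delivered differs.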