Spaces: Running on Zero

Upload step03_chatbot.py with huggingface_hub

step03_chatbot.py  CHANGED  (+56 -22)
@@ -638,6 +638,10 @@ class GenericRAGChatbot:
         print(" - Chargement du tokenizer...")
         self.generation_tokenizer = AutoTokenizer.from_pretrained(self.generation_model_name)
 
+        # Configuration correcte pour Qwen3
+        if self.generation_tokenizer.pad_token is None:
+            self.generation_tokenizer.pad_token = self.generation_tokenizer.eos_token
+
         # Configuration du modèle selon la plateforme
         model_kwargs = self._get_generation_model_config()
 
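The hunk above adds a pad-token fallback before the model itself is configured. A minimal standalone sketch of the same idea (the checkpoint name is only an illustration, not taken from the Space):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative checkpoint

# Many chat models ship without a dedicated pad token; reusing the EOS token
# gives the tokenizer something to pad with and generate() a valid pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(tokenizer.pad_token_id == tokenizer.eos_token_id)  # True when the fallback was applied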
@@ -838,13 +842,31 @@ Instructions importantes:
         messages.append({"role": "user", "content": user_message})
 
         try:
+            # Formatage manuel plus stable pour ZeroGPU
+            formatted_messages = []
+            for msg in messages:
+                if msg["role"] == "system":
+                    formatted_messages.append(f"<|im_start|>system\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "user":
+                    formatted_messages.append(f"<|im_start|>user\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "assistant":
+                    formatted_messages.append(f"<|im_start|>assistant\n{msg['content']}<|im_end|>")
+
+            # Ajouter le prompt de génération
+            formatted_messages.append("<|im_start|>assistant\n")
+            formatted_prompt = "\n".join(formatted_messages)
+
             # Tokenisation
-            inputs = self.generation_tokenizer
-            …
+            inputs = self.generation_tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=4096,
+                padding=True
+            )
+
+            # Déplacement vers le device
+            inputs = {k: v.to(self.generation_device) for k, v in inputs.items()}
 
             # Génération streamée
             from transformers import TextIteratorStreamer
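The added lines build the prompt by hand in ChatML form instead of relying on the tokenizer's chat template. A self-contained sketch of that formatting (the helper name and the sample messages are illustrative):

def build_chatml_prompt(messages):
    # Wrap each turn in <|im_start|>role ... <|im_end|> markers, then open an
    # assistant turn so the model continues from there.
    parts = []
    for msg in messages:
        if msg["role"] in ("system", "user", "assistant"):
            parts.append(f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>")
    parts.append("<|im_start|>assistant\n")
    return "\n".join(parts)

print(build_chatml_prompt([
    {"role": "system", "content": "Tu es un assistant RAG."},
    {"role": "user", "content": "Quelle est la capitale de la France ?"},
]))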
@@ -858,13 +880,15 @@ Instructions importantes:
             )
 
             generation_kwargs = {
-                "input_ids": inputs,
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
                 "streamer": streamer,
-                "max_new_tokens":
+                "max_new_tokens": 512,
                 "temperature": 0.7,
                 "do_sample": True,
-                "pad_token_id": self.generation_tokenizer.
+                "pad_token_id": self.generation_tokenizer.pad_token_id,
                 "eos_token_id": self.generation_tokenizer.eos_token_id,
+                "use_cache": True
             }
 
             # Lancer la génération dans un thread séparé
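The generation_kwargs dict above feeds a TextIteratorStreamer: generate() runs in a background thread while the streamer yields decoded text chunks. A hedged sketch of that pattern outside the class (model loading and prompt are illustrative, not from the file):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer("<|im_start|>user\nBonjour<|im_end|>\n<|im_start|>assistant\n",
                   return_tensors="pt", padding=True)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generation_kwargs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "streamer": streamer,
    "max_new_tokens": 512,
    "temperature": 0.7,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "use_cache": True,
}

# generate() blocks, so it runs in its own thread; the main thread consumes chunks.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()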
@@ -919,33 +943,43 @@ Réponds à cette question en te basant sur le contexte fourni."""
 
         # Formatage pour le modèle
         try:
-            #
-            …
+            # Formatage manuel plus stable pour ZeroGPU
+            formatted_messages = []
+            for msg in messages:
+                if msg["role"] == "system":
+                    formatted_messages.append(f"<|im_start|>system\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "user":
+                    formatted_messages.append(f"<|im_start|>user\n{msg['content']}<|im_end|>")
+                elif msg["role"] == "assistant":
+                    formatted_messages.append(f"<|im_start|>assistant\n{msg['content']}<|im_end|>")
+
+            # Ajouter le prompt de génération
+            formatted_messages.append("<|im_start|>assistant\n")
+            formatted_prompt = "\n".join(formatted_messages)
+
+            # Tokenisation avec padding et attention mask appropriés
             inputs = self.generation_tokenizer(
                 formatted_prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=4096
+                max_length=4096,
+                padding=True
             )
 
             # Déplacement vers le device
             inputs = {k: v.to(self.generation_device) for k, v in inputs.items()}
 
-            # Génération
+            # Génération avec paramètres simplifiés
             with torch.no_grad():
                 outputs = self.generation_model.generate(
-                    …
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    max_new_tokens=512,
                     temperature=0.7,
                     do_sample=True,
-                    pad_token_id=self.generation_tokenizer.
+                    pad_token_id=self.generation_tokenizer.pad_token_id,
                     eos_token_id=self.generation_tokenizer.eos_token_id,
+                    use_cache=True
                 )
 
             # Décodage de la réponse
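In this non-streaming path, generate() now receives an explicit attention mask and pad_token_id. The decoding step that follows the hunk is not shown in the diff; a common way to finish it, sketched under that assumption (checkpoint and prompt are illustrative), is to slice off the prompt tokens before decoding:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompt = "<|im_start|>user\nBonjour<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096, padding=True)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

# Keep only the newly generated tokens, then decode them to text.
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))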