Spaces:

ewan-rvl
/

toxicity-detector

Sleeping

App Files Files Community

toxicity-detector / app.py

ewan-rvl

First true version

27b9ad2 7 months ago

raw

history blame

2.44 kB

	import os
	import re
	import logging
	import nltk
	import torch
	import gradio as gr
	from transformers import pipeline, AutoConfig
	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	from textblob import TextBlob

	# Logging setup: emit every record from DEBUG level upwards.
	logging.basicConfig(level=logging.DEBUG)

	# Pick the pipeline device: index 0 when CUDA is available, -1 otherwise.
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = -1

	# Load the model configuration and build the text-classification pipeline.
	model_name = "AgentPublic/camembert-base-toxic-fr-user-prompts"
	config = AutoConfig.from_pretrained(model_name)
	classifier = pipeline(
	    'text-classification',
	    model=model_name,
	    device=device,
	)

	# Load the NLTK resources, downloading only the ones that are missing so
	# that a warm start does not hit the network.
	# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases
	# make word_tokenize look up the tabular Punkt data instead of the
	# pickled one; older releases keep using 'punkt'.
	_NLTK_RESOURCES = (
	    ('tokenizers/punkt', 'punkt'),
	    ('tokenizers/punkt_tab', 'punkt_tab'),
	    ('corpora/wordnet', 'wordnet'),
	)
	for _resource_path, _package_name in _NLTK_RESOURCES:
	    try:
	        nltk.data.find(_resource_path)
	    except LookupError:
	        nltk.download(_package_name)

	# Shared lemmatizer instance used by detect_toxicity.
	lemmatizer = WordNetLemmatizer()
	# Words that flag a message as toxic on their own, whatever the model says.
	insult_words = [
	    "con", "cons", "connard", "connards", "enculé", "enculés",
	    "pute", "putes", "putain", "merde", "idiot",
	]

	# Case-insensitive whole-word match on any listed insult.
	# The alternatives must be joined with a bare '|': joining with '\|'
	# produces an escaped, literal pipe, so the pattern would only ever match
	# the literal text "con|cons|connard|..." and never a single word.
	# re.escape keeps the pattern valid if a word containing regex
	# metacharacters is added to the list later.
	insult_pattern = re.compile(
	    r'\b(?:' + '|'.join(re.escape(word) for word in insult_words) + r')\b',
	    re.IGNORECASE,
	)

	def analyze_text(text, threshold=0.5):
	    """Run the classifier on ``text`` and report whether it is toxic.

	    Args:
	        text: String handed to the text-classification pipeline (called
	            with ``truncation=True``).
	        threshold: Minimum score required to accept a toxic prediction.

	    Returns:
	        True when the first result's label equals the toxic label and its
	        score is greater than or equal to ``threshold``; False otherwise.
	    """
	    prediction = classifier(text, truncation=True)[0]
	    score = prediction['score']

	    # Invert label2id to recover the label name registered under id 1,
	    # which this code treats as the toxic class; fall back to "toxic"
	    # when the config has no such id.
	    id_to_label = {}
	    for label_name, label_id in config.label2id.items():
	        id_to_label[label_id] = label_name
	    toxic_label = id_to_label.get(1, "toxic")

	    logging.debug(f"Texte: {text} -> Score: {score}")

	    if prediction['label'] != toxic_label:
	        return False
	    return score >= threshold

	def detect_toxicity(message):
	    """Decide whether a message is toxic using the model plus heuristics.

	    A message is flagged as soon as one of these holds, checked from the
	    cheapest to the most expensive:
	      * it contains a word matched by ``insult_pattern``;
	      * ``analyze_text`` classifies it as toxic;
	      * the TextBlob polarity of its lower-cased, lemmatized tokens is
	        below -0.5.

	    Args:
	        message: Text to check. ``None``, empty and whitespace-only
	            values are treated as non-toxic.

	    Returns:
	        bool: True if the message is considered toxic, False otherwise.
	    """
	    # Nothing to analyse: skip tokenization and model inference entirely.
	    if not message or not message.strip():
	        return False

	    # Regex first: it is the cheapest signal and needs no model call.
	    # bool() keeps the result a plain boolean rather than an re.Match.
	    if bool(insult_pattern.search(message)):
	        return True

	    if analyze_text(message):
	        return True

	    # NOTE(review): the WordNet lemmatizer and TextBlob's default sentiment
	    # analyzer look English-oriented; confirm they are meaningful for the
	    # French input this app expects.
	    words = [lemmatizer.lemmatize(word) for word in word_tokenize(message.lower())]
	    sentiment = TextBlob(" ".join(words)).sentiment.polarity
	    return sentiment < -0.5

	def predict(text):
	    """Format the toxicity verdict for ``text`` as a display string.

	    Returns "Is toxic: " followed by the value returned by
	    ``detect_toxicity(text)``.
	    """
	    return f"Is toxic: {detect_toxicity(text)}"

	# Build the Gradio interface around predict().
	# gr.Textbox is used instead of the legacy gr.inputs.Textbox namespace,
	# which is deprecated in Gradio 3.x and no longer exists in Gradio 4+.
	iface = gr.Interface(
	    fn=predict,
	    inputs=gr.Textbox(lines=5, label="Texte en français"),
	    outputs="text",
	    title="Détecteur de Toxicité",
	    description="Entrez un texte en français pour vérifier s'il est toxique.",
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()