Spaces:

JanviMl
/

toxic-comment-classifier

Paused

App Files Files Community

toxic-comment-classifier / paraphraser.py

JanviMl

Update paraphraser.py

7fa4f70 verified 6 months ago

raw

history blame

2.82 kB

	# paraphraser.py
	from model_loader import paraphraser_model

	def paraphrase_comment(comment):
	    """
	    Paraphrase a toxic comment using the Granite 3.2-2B-Instruct model.

	    A few-shot prompt (guidelines plus three Toxic/Neutral example pairs) is
	    built around ``comment`` and passed to ``paraphraser_model`` for beam-search
	    generation of up to 50 new tokens.

	    Args:
	        comment: The comment text to rewrite.

	    Returns:
	        The paraphrased comment as a string, ``None`` when ``comment`` is
	        empty or whitespace-only, or a string starting with
	        ``"Error paraphrasing comment: "`` if anything fails while running
	        the model (the exception is reported in the string, not raised).
	    """
	    # Nothing to paraphrase: treat whitespace-only text like an empty comment
	    # instead of asking the model to rewrite nothing.
	    if not comment or (isinstance(comment, str) and not comment.strip()):
	        return None

	    try:
	        model = paraphraser_model.model
	        tokenizer = paraphraser_model.tokenizer

	        # Create a detailed prompt with guidelines and examples
	        prompt = (
	            "You are a content moderator tasked with rewriting toxic comments into neutral and constructive ones while maintaining the original meaning. "
	            "Follow these guidelines:\n"
	            "- Remove explicit hate speech, personal attacks, or offensive language.\n"
	            "- Keep the response neutral and conversational, suitable for a casual online platform.\n"
	            "- Ensure the rewritten comment retains the original intent but in a constructive tone, addressing the specific context of the comment (e.g., disagreement, frustration).\n\n"
	            "Examples:\n"
	            "Toxic: \"You're so dumb! You never understand anything!\"\n"
	            "Neutral: \"I think there might be a misunderstanding here. Can we go over this again to clear things up?\"\n"
	            "Toxic: \"This is the worst idea ever. Only an idiot would suggest this.\"\n"
	            "Neutral: \"I’m not sure this idea works for me. Could we look at some other options instead?\"\n"
	            "Toxic: \"You are an idiot and should leave this platform.\"\n"
	            "Neutral: \"It seems like you might not be enjoying this platform. Maybe we can talk about what’s not working for you?\"\n\n"
	            f"Now, rewrite this comment: \"{comment}\""
	        )
	        # NOTE(review): truncation keeps at most 512 tokens; with the default
	        # right-side truncation a very long comment is what gets cut, since it
	        # sits at the end of the prompt -- confirm this is acceptable.
	        inputs = tokenizer(
	            prompt,
	            return_tensors="pt",
	            truncation=True,
	            padding=True,
	            max_length=512,
	        )

	        # Generate the paraphrased comment with beam search (no sampling)
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=50,    # Number of new tokens to generate (excludes input length)
	            num_beams=4,          # Beam search for more consistent generation
	            early_stopping=True,  # Stop once enough finished beams are found
	            do_sample=False,      # Disable sampling to use beam search
	        )

	        generated_ids = outputs[0]
	        # Decoder-only models return the prompt tokens followed by the new
	        # tokens. Drop the prompt by token count rather than by string
	        # replacement: the decoded prompt is not guaranteed to match the
	        # original text (truncation, whitespace/Unicode normalisation), in
	        # which case a string replace would leave the whole prompt in place.
	        # Encoder-decoder models return only the new tokens, so skip slicing.
	        model_config = getattr(model, "config", None)
	        if not getattr(model_config, "is_encoder_decoder", False):
	            prompt_length = inputs["input_ids"].shape[1]
	            generated_ids = generated_ids[prompt_length:]

	        paraphrased_comment = tokenizer.decode(
	            generated_ids, skip_special_tokens=True
	        ).strip()

	        # Remove an echoed "Neutral:" label copied from the few-shot examples
	        label = "Neutral:"
	        if paraphrased_comment.startswith(label):
	            paraphrased_comment = paraphrased_comment[len(label):].strip()
	        return paraphrased_comment

	    except Exception as e:
	        # Best-effort contract: report the failure as text so callers always
	        # receive a string for non-empty input.
	        return f"Error paraphrasing comment: {str(e)}"