Spaces:
Runtime error
Runtime error
| import weave | |
| from pydantic import BaseModel | |
| from rich.progress import track | |
| from .base import Guardrail | |
class GuardrailManager(weave.Model):
    """
    Manages and executes a series of guardrails on a given prompt.

    Built on the `weave` framework's `Model` base class so that guardrail
    runs can be tracked and evaluated.

    Attributes:
        guardrails (list[Guardrail]): The Guardrail objects whose checks are
            applied, in order, to each input prompt.
    """

    guardrails: list[Guardrail]

    def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
        """
        Execute every guardrail on `prompt` and aggregate the results.

        Each guardrail's `guard` method is called with the prompt (and any
        extra keyword arguments). A guardrail may return either a pydantic
        `BaseModel` (read via `.safe` / `.explanation`) or a plain dict
        (read via `["safe"]` / `["summary"]`). The prompt is considered safe
        only if every guardrail reports it safe.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            progress_bar (bool, optional): If True, displays a progress bar
                while processing the guardrails. Defaults to True.
            **kwargs: Additional keyword arguments forwarded to each
                guardrail's `guard` method.

        Returns:
            dict: A dictionary containing:
                - "safe" (bool): True iff every guardrail deemed the prompt safe.
                - "alerts" (list): One dict per guardrail with its class name
                  ("guardrail_name") and raw response ("response").
                - "summary" (str): A markdown-formatted report concatenating
                  each guardrail's explanation/summary.
        """
        alerts: list[dict] = []
        summary_parts: list[str] = []  # joined once at the end; avoids quadratic str +=
        safe = True
        iterable = (
            track(self.guardrails, description="Running guardrails")
            if progress_bar
            else self.guardrails
        )
        for guardrail in iterable:
            name = guardrail.__class__.__name__  # hoisted: used in both alert and summary
            response = guardrail.guard(prompt, **kwargs)
            alerts.append({"guardrail_name": name, "response": response})
            if isinstance(response, BaseModel):
                safe = safe and response.safe
                summary_parts.append(f"**{name}**: {response.explanation}\n\n---\n\n")
            else:
                safe = safe and response["safe"]
                summary_parts.append(f"**{name}**: {response['summary']}\n\n---\n\n")
        return {"safe": safe, "alerts": alerts, "summary": "".join(summary_parts)}

    def predict(self, prompt: str, **kwargs) -> dict:
        """
        Evaluate `prompt` with all guardrails, without a progress bar.

        Thin wrapper around `guard` providing the `weave.Model` prediction
        interface.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            **kwargs: Additional keyword arguments forwarded to each
                guardrail's `guard` method.

        Returns:
            dict: Same structure as `guard`: "safe" (bool), "alerts" (list of
            per-guardrail dicts), and "summary" (markdown report string).
        """
        return self.guard(prompt, progress_bar=False, **kwargs)