# Hugging Face Space page header (status: Sleeping) — kept as a comment so
# the file remains valid Python.
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os

# Credentials and the anonymizer implementation are injected via environment
# variables (Hugging Face Space secrets). A missing variable raises KeyError
# at startup — fail fast rather than launching a broken app.
passw = os.environ['passw']
runclass = os.environ['anonclass']

# The secret holds Python source wrapped in quotes; strip the quote characters
# before executing. str.strip() takes a *set* of characters, so '"' is exactly
# equivalent to the original '"""'.
runclass_exc = runclass.strip('"')

# SECURITY: exec() runs arbitrary code taken from the environment. This is a
# deliberate pattern to keep the Anonymizer implementation private to the
# Space owner, but anyone who can set the 'anonclass' secret controls this
# process. Expected to define the module-level `anonymizer` object used below.
exec(runclass_exc)

# Number of sentences handed to the anonymizer per batch in anonymize_texts().
batch_size = 4
def process_batch(sentences, anonymizer, desired_length):
    """Anonymize one batch of sentences.

    Runs ``anonymizer.process_sentence`` on each sentence and regroups the
    per-sentence 4-tuples into four parallel lists.

    Returns:
        Tuple ``(new_sentences, original_token_ids, new_token_ids,
        attention_masks)``, each a list with one entry per input sentence.
    """
    per_sentence = [
        anonymizer.process_sentence(sentence, desired_length)
        for sentence in sentences
    ]
    if not per_sentence:
        # zip(*[]) yields nothing to unpack, so return four empty lists.
        return [], [], [], []
    new_sents, original_ids, new_ids, masks = (list(col) for col in zip(*per_sentence))
    return new_sents, original_ids, new_ids, masks
def anonymize_texts(text_list, desired_length, output_type):
    """Anonymize newline-separated sentences and format the result.

    Args:
        text_list: Sentences separated by '\n' (value of the Gradio TextArea).
        desired_length: Token count for anonymization; Gradio's Number may
            deliver a float, hence the int() cast.
        output_type: One of the dropdown choices selecting the output view.

    Returns:
        A single string, one result per line.

    Raises:
        ValueError: If ``output_type`` is not a recognized choice. Unreachable
            through the Gradio dropdown; fails loudly instead of silently
            returning None on programmatic misuse.
    """
    desired_length = int(desired_length)
    sentences = text_list.split('\n')
    new_sentences, all_original_token_ids, all_new_token_ids, all_attention_masks = [], [], [], []
    # `anonymizer` is the module-level object created by the exec()'d secret.
    # Note: the original token ids are accumulated but not exposed by any
    # output_type branch below.
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[i:i + batch_size]
        batch_results, batch_original_ids, batch_new_ids, batch_attention_masks = process_batch(batch, anonymizer, desired_length)
        new_sentences.extend(batch_results)
        all_original_token_ids.extend(batch_original_ids)
        all_new_token_ids.extend(batch_new_ids)
        all_attention_masks.extend(batch_attention_masks)
    if output_type == "New Sentences":
        return "\n".join(new_sentences)
    if output_type == "Token IDs":
        return "\n".join(str(ids) for ids in all_new_token_ids)
    if output_type == "Attention Masks":
        return "\n".join(str(masks) for masks in all_attention_masks)
    if output_type == "Token IDs & Attention Masks":
        return "\n".join(
            f"Token IDs: {token_ids}\nAttention Masks: {masks}\n"
            for token_ids, masks in zip(all_new_token_ids, all_attention_masks)
        )
    raise ValueError(f"Unknown output type: {output_type!r}")
# Gradio UI: multi-line text input, a numeric token-count control, and an
# output selector whose choices match the branches in anonymize_texts().
interface = gr.Interface(
    fn=anonymize_texts,
    inputs=[
        gr.TextArea(label="Input Text"),
        gr.Number(label="Desired Length"),
        gr.Dropdown(choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"], label="Output Type")
    ],
    outputs=gr.TextArea(label="Output Text"),
    title="Anonymizer",
    description="Enter multiple sentences (one per line), select the amount of tokens for anonymization, and choose the output type. Note: must be in Dutch."
)
# HTTP basic auth guards the Space. NOTE(review): the same secret is used as
# both username and password — consider a separate username secret.
interface.launch(auth=(passw, passw))