# anonimiseren / app.py
# jairwaal's picture
# Update app.py
# bf1363a verified
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os
passw = os.environ['passw']
runclass = os.environ['anonclass']
runclass_exc = runclass.strip('"""')
exec(runclass_exc)
batch_size = 4
def process_batch(sentences, anonymizer, desired_length):
batch_results = []
batch_original_token_ids = []
batch_new_token_ids = []
batch_attention_masks = []
for sentence in sentences:
new_sentence, original_token_ids, new_token_ids, attention_masks = anonymizer.process_sentence(sentence, desired_length)
batch_results.append(new_sentence)
batch_original_token_ids.append(original_token_ids)
batch_new_token_ids.append(new_token_ids)
batch_attention_masks.append(attention_masks)
return batch_results, batch_original_token_ids, batch_new_token_ids, batch_attention_masks
def anonymize_texts(text_list, desired_length, output_type):
desired_length = int(desired_length)
sentences = text_list.split('\n')
new_sentences, all_original_token_ids, all_new_token_ids, all_attention_masks = [], [], [], []
for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
batch = sentences[i:i + batch_size]
batch_results, batch_original_ids, batch_new_ids, batch_attention_masks = process_batch(batch, anonymizer, desired_length)
new_sentences.extend(batch_results)
all_original_token_ids.extend(batch_original_ids)
all_new_token_ids.extend(batch_new_ids)
all_attention_masks.extend(batch_attention_masks)
del batch_results, batch_original_ids, batch_new_ids, batch_attention_masks
if output_type == "New Sentences":
return "\n".join(new_sentences)
elif output_type == "Token IDs":
return "\n".join([str(ids) for ids in all_new_token_ids])
elif output_type == "Attention Masks":
return "\n".join([str(masks) for masks in all_attention_masks])
elif output_type == "Token IDs & Attention Masks":
combined_output = []
for token_ids, masks in zip(all_new_token_ids, all_attention_masks):
combined_output.append(f"Token IDs: {token_ids}\nAttention Masks: {masks}\n")
return "\n".join(combined_output)
interface = gr.Interface(
fn=anonymize_texts,
inputs=[
gr.TextArea(label="Input Text"),
gr.Number(label="Desired Length"),
gr.Dropdown(choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"], label="Output Type")
],
outputs=gr.TextArea(label="Output Text"),
title="Anonymizer",
description="Enter multiple sentences (one per line), select the amount of tokens for anonymization, and choose the output type. Note: must be in Dutch."
)
interface.launch(auth=(passw, passw))