"""Gradio front-end for a Dutch sentence anonymizer.

The anonymizer implementation is injected at runtime through the
``anonclass`` environment variable and brought to life with ``exec`` below,
so the class is not visible in this file.  Several imports that look unused
here (torch, numpy, faiss, random, AutoTokenizer/AutoModel) are kept on
purpose: the exec'd code presumably relies on them being in scope — TODO
confirm against the injected source.
"""
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os

# Shared secret used as both the Gradio username and the password (see the
# launch() call at the bottom).  Raises KeyError at startup if unset, which
# is the desired fail-fast behaviour.
passw = os.environ['passw']

# Source code of the anonymizer, stored in an env var so it never appears in
# the repository.
runclass = os.environ['anonclass']
# NOTE: str.strip takes a character *set*, so '"""' is equivalent to '"' —
# this trims any leading/trailing double quotes, not a literal triple-quote.
runclass_exc = runclass.strip('"""')
# SECURITY: exec() runs arbitrary code taken from the process environment.
# This is the app's deliberate obfuscation mechanism, but anyone who can set
# the `anonclass` env var gets full code execution — keep it trusted.
# The exec'd code is expected to define a global `anonymizer` object with a
# process_sentence(sentence, desired_length) method.
exec(runclass_exc)

# Number of sentences handed to the anonymizer per batch.
batch_size = 4


def process_batch(sentences, anonymizer, desired_length):
    """Anonymize one batch of sentences.

    Parameters
    ----------
    sentences : list[str]
        Sentences to anonymize.
    anonymizer : object
        Object exposing ``process_sentence(sentence, desired_length)`` which
        returns ``(new_sentence, original_token_ids, new_token_ids,
        attention_masks)``.
    desired_length : int
        Token budget forwarded to the anonymizer.

    Returns
    -------
    tuple of four lists
        (new sentences, original token ids, new token ids, attention masks),
        each parallel to ``sentences``.
    """
    batch_results = []
    batch_original_token_ids = []
    batch_new_token_ids = []
    batch_attention_masks = []
    for sentence in sentences:
        new_sentence, original_ids, new_ids, masks = anonymizer.process_sentence(
            sentence, desired_length
        )
        batch_results.append(new_sentence)
        batch_original_token_ids.append(original_ids)
        batch_new_token_ids.append(new_ids)
        batch_attention_masks.append(masks)
    return (
        batch_results,
        batch_original_token_ids,
        batch_new_token_ids,
        batch_attention_masks,
    )


def anonymize_texts(text_list, desired_length, output_type):
    """Gradio handler: anonymize newline-separated sentences.

    Parameters
    ----------
    text_list : str
        One sentence per line.
    desired_length : int or float
        Token budget per sentence (Gradio's Number widget may deliver a
        float, hence the int() coercion).
    output_type : str
        One of the dropdown choices selecting what to render.

    Returns
    -------
    str
        The selected view of the anonymization results, one entry per line.

    Raises
    ------
    ValueError
        If ``output_type`` is not a recognised choice.  (The original code
        fell through and implicitly returned None here.)
    """
    desired_length = int(desired_length)
    sentences = text_list.split('\n')

    new_sentences = []
    all_original_token_ids = []
    all_new_token_ids = []
    all_attention_masks = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[i:i + batch_size]
        results, orig_ids, new_ids, masks = process_batch(
            batch, anonymizer, desired_length
        )
        new_sentences.extend(results)
        all_original_token_ids.extend(orig_ids)
        all_new_token_ids.extend(new_ids)
        all_attention_masks.extend(masks)

    if output_type == "New Sentences":
        return "\n".join(new_sentences)
    if output_type == "Token IDs":
        return "\n".join(str(ids) for ids in all_new_token_ids)
    if output_type == "Attention Masks":
        return "\n".join(str(masks) for masks in all_attention_masks)
    if output_type == "Token IDs & Attention Masks":
        return "\n".join(
            f"Token IDs: {token_ids}\nAttention Masks: {masks}\n"
            for token_ids, masks in zip(all_new_token_ids, all_attention_masks)
        )
    # Unreachable through the UI (dropdown constrains the value), but fail
    # loudly instead of silently returning None.
    raise ValueError(f"Unknown output type: {output_type!r}")


interface = gr.Interface(
    fn=anonymize_texts,
    inputs=[
        gr.TextArea(label="Input Text"),
        gr.Number(label="Desired Length"),
        gr.Dropdown(
            choices=[
                "New Sentences",
                "Token IDs",
                "Attention Masks",
                "Token IDs & Attention Masks",
            ],
            label="Output Type",
        ),
    ],
    outputs=gr.TextArea(label="Output Text"),
    title="Anonymizer",
    description=(
        "Enter multiple sentences (one per line), select the amount of tokens "
        "for anonymization, and choose the output type. Note: must be in Dutch."
    ),
)

# NOTE(review): username and password are the same secret — confirm this is
# intentional before exposing the app publicly.
interface.launch(auth=(passw, passw))