# Hugging Face Space page header (status: Sleeping) — kept as a comment so
# the file remains valid Python.
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os

# Credentials and the anonymizer implementation are injected via environment
# variables (Hugging Face Space secrets). A missing variable raises KeyError
# at startup — fail fast rather than launching a broken app.
passw = os.environ['passw']
runclass = os.environ['anonclass']

# The secret holds Python source wrapped in quotes; strip the quote characters
# before executing. str.strip() takes a *set* of characters, so '"' is exactly
# equivalent to the original '"""'.
runclass_exc = runclass.strip('"')

# SECURITY: exec() runs arbitrary code taken from the environment. This is a
# deliberate pattern to keep the Anonymizer implementation private to the
# Space owner, but anyone who can set the 'anonclass' secret controls this
# process. Expected to define the module-level `anonymizer` object used below.
exec(runclass_exc)

# Number of sentences handed to the anonymizer per batch in anonymize_texts().
batch_size = 4
def process_batch(sentences, anonymizer, desired_length):
    """Anonymize one batch of sentences.

    Runs ``anonymizer.process_sentence`` on each sentence and regroups the
    per-sentence 4-tuples into four parallel lists.

    Returns:
        Tuple ``(new_sentences, original_token_ids, new_token_ids,
        attention_masks)``, each a list with one entry per input sentence.
    """
    per_sentence = [
        anonymizer.process_sentence(sentence, desired_length)
        for sentence in sentences
    ]
    if not per_sentence:
        # zip(*[]) yields nothing to unpack, so return four empty lists.
        return [], [], [], []
    new_sents, original_ids, new_ids, masks = (list(col) for col in zip(*per_sentence))
    return new_sents, original_ids, new_ids, masks
def anonymize_texts(text_list, desired_length, output_type):
    """Anonymize newline-separated sentences and format the result.

    Args:
        text_list: Sentences separated by '\n' (value of the Gradio TextArea).
        desired_length: Token count for anonymization; Gradio's Number may
            deliver a float, hence the int() cast.
        output_type: One of the dropdown choices selecting the output view.

    Returns:
        A single string, one result per line.

    Raises:
        ValueError: If ``output_type`` is not a recognized choice. Unreachable
            through the Gradio dropdown; fails loudly instead of silently
            returning None on programmatic misuse.
    """
    desired_length = int(desired_length)
    sentences = text_list.split('\n')
    new_sentences, all_original_token_ids, all_new_token_ids, all_attention_masks = [], [], [], []
    # `anonymizer` is the module-level object created by the exec()'d secret.
    # Note: the original token ids are accumulated but not exposed by any
    # output_type branch below.
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[i:i + batch_size]
        batch_results, batch_original_ids, batch_new_ids, batch_attention_masks = process_batch(batch, anonymizer, desired_length)
        new_sentences.extend(batch_results)
        all_original_token_ids.extend(batch_original_ids)
        all_new_token_ids.extend(batch_new_ids)
        all_attention_masks.extend(batch_attention_masks)
    if output_type == "New Sentences":
        return "\n".join(new_sentences)
    if output_type == "Token IDs":
        return "\n".join(str(ids) for ids in all_new_token_ids)
    if output_type == "Attention Masks":
        return "\n".join(str(masks) for masks in all_attention_masks)
    if output_type == "Token IDs & Attention Masks":
        return "\n".join(
            f"Token IDs: {token_ids}\nAttention Masks: {masks}\n"
            for token_ids, masks in zip(all_new_token_ids, all_attention_masks)
        )
    raise ValueError(f"Unknown output type: {output_type!r}")
# Gradio UI: multi-line text input, a numeric token-count control, and an
# output selector whose choices match the branches in anonymize_texts().
interface = gr.Interface(
    fn=anonymize_texts,
    inputs=[
        gr.TextArea(label="Input Text"),
        gr.Number(label="Desired Length"),
        gr.Dropdown(choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"], label="Output Type")
    ],
    outputs=gr.TextArea(label="Output Text"),
    title="Anonymizer",
    description="Enter multiple sentences (one per line), select the amount of tokens for anonymization, and choose the output type. Note: must be in Dutch."
)
# HTTP basic auth guards the Space. NOTE(review): the same secret is used as
# both username and password — consider a separate username secret.
interface.launch(auth=(passw, passw))