Spaces:

jairwaal
/

anonimiseren

Sleeping

App Files Files Community

jairwaal committed on Jan 30, 2024

Commit

9feb43b

verified ·

1 Parent(s): 28f3e40

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -9

app.py CHANGED Viewed

@@ -1,18 +1,123 @@
 import os
-import requests
-github_pat = os.environ['github_pat']
-raw_url = f'https://{github_pat}@raw.githubusercontent.com/waaljair/testgradio/main/run.py'
-response = requests.get(raw_url)
-if response.status_code == 200:
-    exec(response.text)
-else:
-    raise Exception(f"Failed to fetch the Python file from the repository. Status code: {response.status_code}")
 anonymizer = Anonimiseren()
-print(anonymizer)

+import gradio as gr
+from transformers import AutoTokenizer, AutoModel
+import torch
+import numpy as np
+import faiss
+import random
+from tqdm import tqdm
 import os
+# The anonymizer implementation is not stored in this file: its Python source
+# is read from the `anonclass` environment variable. If the variable is not
+# set, this lookup raises KeyError while the module is being loaded.
+runclass = os.environ['anonclass']
+# The triple-quoted string below is a bare expression statement: it is never
+# assigned and never executed by this file.
+# NOTE(review): it looks like a reference copy of the source supplied through
+# `anonclass` — confirm the two are kept in sync.
+"""
+class Anonimiseren:
+    def __init__(self, model_name="CLTL/MedRoBERTa.nl"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self._prepare_vocab_index()
+    def _prepare_vocab_index(self):
+        vocab_embeddings = self.model.get_input_embeddings().weight.data.numpy()
+        self.vocab_index = faiss.IndexFlatL2(vocab_embeddings.shape[1])
+        self.vocab_index.add(vocab_embeddings)
+    def get_closest_token_id(self, embedding, original_token_id, desired_length=3):
+        D, I = self.vocab_index.search(np.array([embedding]), desired_length + 10)
+        exclude_token_ids = [self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.pad_token_id, self.tokenizer.mask_token_id]
+        closest_token_ids = [token_id for token_id in I[0] if token_id != original_token_id and token_id not in exclude_token_ids]
+        idx = 0
+        while len(closest_token_ids) < desired_length and idx < len(I[0]):
+            token_id = I[0][idx]
+            if token_id not in closest_token_ids and token_id != original_token_id and token_id not in exclude_token_ids:
+                closest_token_ids.append(token_id)
+            idx += 1
+        return random.choice(closest_token_ids[:desired_length])
+    def process_sentence(self, text, desired_length=1):
+        encoded_input = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=256, padding='max_length', return_tensors='pt', truncation=True)
+        attention_masks = encoded_input['attention_mask'].squeeze().tolist()
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+        last_layer_embeddings = model_output.last_hidden_state.squeeze(0).numpy()
+        original_token_ids = encoded_input['input_ids'].squeeze().tolist()
+        new_token_ids = []
+        for i, embedding in enumerate(last_layer_embeddings):
+            token_id = original_token_ids[i]
+            if token_id in [self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.pad_token_id, self.tokenizer.mask_token_id] or attention_masks[i] == 0:
+                new_token_ids.append(token_id)
+            else:
+                closest_token_id = self.get_closest_token_id(embedding, token_id, desired_length)
+                new_token_ids.append(closest_token_id)
+        new_sentence = self.tokenizer.decode(new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        return new_sentence.strip(), original_token_ids, new_token_ids, attention_masks
 anonymizer = Anonimiseren()
+"""
+# Execute the source taken from the environment in this module's namespace.
+# NOTE(review): `anonymize_texts` below reads a global named `anonymizer`,
+# which is presumably created by this exec — confirm.
+# SECURITY: exec() runs whatever the variable contains, so `anonclass` must
+# only ever be set from a trusted source.
+exec(runclass)
+# Number of input lines handed to process_batch per call by anonymize_texts.
+batch_size = 4
+def process_batch(sentences, anonymizer, desired_length):
+    """Run ``anonymizer.process_sentence`` on every sentence and gather the results.
+
+    Args:
+        sentences: Iterable of input strings, processed one at a time in order.
+        anonymizer: Object exposing ``process_sentence(sentence, desired_length)``
+            that returns a 4-item result per sentence.
+        desired_length: Forwarded unchanged to ``process_sentence``.
+
+    Returns:
+        A tuple of four lists, each in input order: the new sentences, the
+        original token ids, the new token ids and the attention masks.
+    """
+    # One accumulator per position of the 4-item result.
+    columns = ([], [], [], [])
+    for sentence in sentences:
+        rewritten, ids_before, ids_after, mask = anonymizer.process_sentence(sentence, desired_length)
+        for column, value in zip(columns, (rewritten, ids_before, ids_after, mask)):
+            column.append(value)
+    return columns
+def anonymize_texts(text_list, desired_length, output_type):
+    """Anonymize line-separated text and format the result as one string.
+
+    Args:
+        text_list: Input text with one sentence per line. ``None`` or an empty
+            string yields an empty result without running the anonymizer.
+        desired_length: Number forwarded to ``process_batch`` after conversion
+            to an integer of at least 1. ``None`` is treated as 1.
+        output_type: One of "New Sentences", "Token IDs", "Attention Masks" or
+            "Token IDs & Attention Masks". ``None`` selects "New Sentences".
+
+    Returns:
+        A string with one entry per input line, joined by newlines.
+
+    Raises:
+        ValueError: If ``output_type`` is not one of the supported names.
+    """
+    supported_types = ("New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks")
+    # An unselected dropdown passes None; previously that fell through every
+    # branch and the function silently returned None.
+    if output_type is None:
+        output_type = "New Sentences"
+    if output_type not in supported_types:
+        raise ValueError(f"Unsupported output type: {output_type!r}")
+    if not text_list:
+        return ""
+    # An empty number field passes None, which int() rejects. Values below 1 are
+    # clamped: the reference implementation kept in this file samples from the
+    # first `desired_length` candidates, which is an empty list for 0.
+    desired_length = 1 if desired_length is None else max(1, int(desired_length))
+    # splitlines() also handles "\r\n" input and does not create a trailing
+    # empty sentence when the text ends with a newline.
+    sentences = text_list.splitlines()
+    new_sentences, all_new_token_ids, all_attention_masks = [], [], []
+    for start in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
+        batch = sentences[start:start + batch_size]
+        # The original token ids are returned by process_batch but not needed here.
+        batch_results, _, batch_new_ids, batch_masks = process_batch(batch, anonymizer, desired_length)
+        new_sentences.extend(batch_results)
+        all_new_token_ids.extend(batch_new_ids)
+        all_attention_masks.extend(batch_masks)
+    if output_type == "New Sentences":
+        return "\n".join(new_sentences)
+    if output_type == "Token IDs":
+        return "\n".join(str(ids) for ids in all_new_token_ids)
+    if output_type == "Attention Masks":
+        return "\n".join(str(masks) for masks in all_attention_masks)
+    # Remaining supported type: "Token IDs & Attention Masks".
+    return "\n".join(
+        f"Token IDs: {token_ids}\nAttention Masks: {masks}\n"
+        for token_ids, masks in zip(all_new_token_ids, all_attention_masks)
+    )
+# Gradio front end for anonymize_texts: a text area, a number and a dropdown
+# feed the three arguments in order; the returned string fills the output area.
+interface = gr.Interface(
+    fn=anonymize_texts,
+    inputs=[
+        gr.TextArea(label="Input Text"),
+        # Default to 1 and round to an integer so the callback does not receive
+        # an empty or fractional value when the field is left untouched.
+        gr.Number(label="Desired Length", value=1, precision=0),
+        # Preselect an output type so the callback does not receive None.
+        gr.Dropdown(choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"], value="New Sentences", label="Output Type")
+    ],
+    outputs=gr.TextArea(label="Output Text"),
+    title="Anonymizer",
+    description="Enter multiple sentences (one per line), select the amount of tokens for anonymization, and choose the output type. Note: must be in Dutch."
+)
+# Start the web server when the module is run.
+interface.launch()