Spaces:

CarolXia
/

kd-finetune

Sleeping

App Files Files Community

CarolXia committed on Nov 26, 2024

Commit

5418ef7

1 Parent(s): 75fffdd

initial commit

Browse files

Files changed (3) hide show

README.md +6 -6
app.py +212 -0
requirements.txt +14 -0

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
 ---
-title: Kd Finetune
-emoji: 📉
-colorFrom: green
-colorTo: yellow
 sdk: streamlit
-sdk_version: 1.40.2
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Traceforce
+emoji: 🦀
+colorFrom: blue
+colorTo: gray
 sdk: streamlit
+sdk_version: 1.40.1
 app_file: app.py
 pinned: false
+license: other
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import streamlit as st
+# from gliner import GLiNER
+from datasets import load_dataset
+import evaluate
+import numpy as np
+import threading
+import time
+from peft import prepare_model_for_kbit_training
+from peft import LoraConfig, get_peft_model, TaskType
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
+seqeval = evaluate.load("seqeval")
+# id2label = {0: "O"}
+# label2id = {"O": 0}
+# def build_id2label(examples):
+#     for i, label in enumerate(examples["mbert_token_classes"]):
+#         if label.startswith("I-") and label not in label2id:
+#             current_len = len(id2label)
+#             id2label[current_len] = label
+#             label2id[label] = current_len
+# Report whether a CUDA device is visible and, if so, which one.
+print(f"Is CUDA available: {torch.cuda.is_available()}")
+# True
+if torch.cuda.is_available():
+    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+# Load the pretrained token-classification model and its tokenizer
+st.write('Loading the pretrained model ...')
+model_name = "iiiorg/piiranha-v1-detect-personal-information"
+model = AutoModelForTokenClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+print(model)
+# Prepare model for LoRA training
+model.train() # model in training mode (dropout modules are activated)
+# enable gradient checkpointing
+model.gradient_checkpointing_enable()
+# run PEFT's k-bit training preparation on the model
+# NOTE(review): the model above is loaded without any quantization arguments;
+# confirm this preparation step is still wanted.
+model = prepare_model_for_kbit_training(model)
+# LoRA config: rank-8 adapters on the modules named "query_proj"
+config = LoraConfig(
+    r=8,
+    lora_alpha=32,
+    target_modules=["query_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type=TaskType.TOKEN_CLS
+)
+# LoRA trainable version of model
+model = get_peft_model(model, config)
+print(model)
+# trainable parameter count
+model.print_trainable_parameters()
+# # print weights
+# pytorch_total_params = sum(p.numel() for p in model.parameters())
+# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
+# Move the model to the GPU when one is available; otherwise it stays where it was loaded.
+if torch.cuda.is_available():
+    model = model.to("cuda")
+# Load rows 1-999 of the dataset's train split.
+# NOTE(review): the slice starts at index 1, so row 0 is skipped -- confirm that is intended.
+raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
+# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
+# Hold out 20% of the loaded rows as the "test" split; the rest become "train".
+raw_dataset = raw_dataset.train_test_split(test_size=0.2)
+print(raw_dataset)
+print(raw_dataset.column_names)
+# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
+# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
+# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
+# inputs = tokenizer(
+#     raw_dataset['train'][0]['mbert_tokens'],
+#     truncation=True,
+#     is_split_into_words=True)
+# print(inputs)
+# print(inputs.tokens())
+# print(inputs.word_ids())
+# Reuse the label2id / id2label mappings stored in the model config
+# (nothing is built from the dataset here).
+st.write("Building label mappings")
+label2id = model.config.label2id
+id2label = model.config.id2label
+# raw_dataset.map(
+#     build_id2label,
+#     batched=False)
+st.write("id2label: ", model.config.id2label)
+st.write("label2id: ", model.config.label2id)
+# function to align labels with tokens
+# --> special tokens: -100 label id (ignored by cross entropy),
+# --> if tokens are inside a word, replace 'B-' with 'I-'
+def align_labels_with_tokens(labels):
+    aligned_label_ids = []
+    aligned_label_ids.append(-100)
+    for i, label in enumerate(labels):
+        if label.startswith("B-"):
+            label = label.replace("B-", "I-")
+        aligned_label_ids.append(label2id[label])
+    aligned_label_ids.append(-100)
+    return aligned_label_ids
+# create tokenize function
+def tokenize_function(examples):
+    # tokenize and truncate text. The examples argument would have already stripped
+    # the train or test label.
+    new_labels = []
+    inputs = tokenizer(
+        examples['mbert_tokens'],
+        is_split_into_words=True,
+        padding=True,
+        truncation=True,
+        max_length=512)
+    for _, labels in enumerate(examples['mbert_token_classes']):
+        new_labels.append(align_labels_with_tokens(labels))
+    inputs["labels"] = new_labels
+    return inputs
+# Tokenize both splits ("train" and "test"); tokenize_function receives the rows in batches.
+tokenized_data = raw_dataset.map(
+    tokenize_function,
+    batched=True)
+# Data collator used by the Trainer to assemble batches.
+data_collator = DataCollatorForTokenClassification(tokenizer)
+# Show the label ids of the first two training examples in the app.
+st.write(tokenized_data["train"][:2]["labels"])
+import os
+# Print every environment variable whose name contains "CUDA" (case-insensitive)
+for key, value in os.environ.items():
+    if "CUDA" in key.upper():
+        print(f"{key}={value}")
+def compute_metrics(eval_preds):
+    logits, labels = eval_preds
+    predictions = np.argmax(logits, axis=-1)
+    # Remove ignored index (special tokens) and convert to labels
+    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
+    true_predictions = [
+        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
+        for prediction, label in zip(predictions, labels)
+    ]
+    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
+    return {
+        "precision": all_metrics["overall_precision"],
+        "recall": all_metrics["overall_recall"],
+        "f1": all_metrics["overall_f1"],
+        "accuracy": all_metrics["overall_accuracy"],
+    }
+# hyperparameters
+lr = 2e-4
+batch_size = 4
+num_epochs = 4
+output_dir = "xia-lora-deberta-v2"
+# define training arguments
+training_args = TrainingArguments(
+    output_dir= output_dir,
+    learning_rate=lr,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    num_train_epochs=num_epochs,
+    weight_decay=0.01,
+    logging_strategy="epoch",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    gradient_accumulation_steps=4,
+    warmup_steps=2,
+    fp16=True,
+    optim="paged_adamw_8bit",
+)
+# configure trainer
+trainer = Trainer(
+    model=model,
+    train_dataset=tokenized_data["train"],
+    eval_dataset=tokenized_data["test"],
+    args=training_args,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics
+)
+# train model
+model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+trainer.train()
+# renable warnings
+model.config.use_cache = True
+st.write('Pushing model to huggingface')
+# Push model to huggingface
+hf_name = 'CarolXia' # your hf username or org name
+model_id = hf_name + "/" + output_dir
+model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
+trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+auto-gptq
+bitsandbytes
+datasets
+evaluate
+seqeval
+gliner
+torch>=2.0.0
+transformers>=4.38.2
+huggingface_hub>=0.21.4
+onnxruntime
+optimum
+peft
+sentencepiece
+tqdm