| import torch | |
| import os | |
| import numpy as np | |
| import gradio as gr | |
| import pytorch_lightning as pl | |
| from torch.utils.data import Dataset, DataLoader | |
| from datasets import load_dataset | |
| from pytorch_lightning.callbacks import ModelCheckpoint | |
| from pytorch_lightning.loggers import TensorBoardLogger | |
| from datasets.dataset_dict import DatasetDict | |
| from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast | |
| from tqdm.auto import tqdm | |
| from summarizer import SummarizerModel | |
| from transformers import AutoTokenizer | |
| from sentence_transformers import SentenceTransformer | |
| import warnings | |
# Silence library deprecation/user warnings so the console output stays clean.
warnings.simplefilter('ignore')

# Base CodeT5 checkpoint the summarizer was fine-tuned from.
MODEL_NAME = 'Salesforce/codet5-base-multi-sum'

# Tokenizer matching the CodeT5 checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fine-tuned summarizer; weights restored from a local checkpoint file
# (expected to sit alongside this script — NOTE(review): path is relative,
# so this only works when launched from the repo root; confirm).
model = SummarizerModel(MODEL_NAME)
model.load_state_dict(torch.load('codet5-base-1_epoch-val_loss-0.80.pth'))

# Sentence embedder used to compare the generated summaries semantically.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
def summarize(text: str,
              tokenizer=tokenizer,
              trained_model=model):
    """Generate a natural-language summary for a code snippet.

    Args:
        text: Source code (as a string) to summarize.
        tokenizer: Tokenizer matching the fine-tuned T5 model.
        trained_model: A SummarizerModel fine-tuned instance of the
            T5 model family.

    Returns:
        The decoded summary produced by beam-search generation.
    """
    encoded = tokenizer.encode_plus(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Beam search with a repetition penalty to discourage degenerate loops.
    output_ids = trained_model.model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    decoded = (
        tokenizer.decode(ids, skip_special_tokens=True,
                         clean_up_tokenization_spaces=True)
        for ids in output_ids
    )
    return "".join(decoded)
def find_similarity_score(code_1, code_2, model=embedding_model):
    """Summarize two code snippets and score their semantic similarity.

    Args:
        code_1: First code snippet (string).
        code_2: Second code snippet (string).
        model: SentenceTransformer used to embed the generated summaries.

    Returns:
        A tuple of (summary of code_1, summary of code_2, cosine similarity
        of the two summary embeddings as a plain Python float rounded to
        2 decimal places).
    """
    summary_code_1 = summarize(text=code_1)
    summary_code_2 = summarize(text=code_2)
    embedding_1 = model.encode(summary_code_1)
    embedding_2 = model.encode(summary_code_2)
    # Cosine similarity. Guard against a zero-norm embedding so we never
    # divide by zero (nan would break the Gradio score output), and cast
    # to a plain Python float — numpy scalars don't always serialize cleanly.
    denom = np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2)
    score = float(np.dot(embedding_1, embedding_2) / denom) if denom else 0.0
    return summary_code_1, summary_code_2, round(score, 2)
# Gradio UI: two code-snippet inputs -> two generated summaries plus their
# similarity score. The former `outputs = gr.outputs.Textbox()` line was
# removed: it was never used, and the `gr.outputs` namespace no longer
# exists in current Gradio releases (it would crash at import time).
iface = gr.Interface(
    fn=find_similarity_score,
    inputs=[gr.Textbox(label='First Code snippet'),
            gr.Textbox(label='Second Code snippet')],
    outputs=[gr.Textbox(label='Summary of first Code snippet'),
             gr.Textbox(label='Summary of second Code snippet'),
             gr.Textbox(label='The similarity score')],
    description=('Summarizes two code snippets with a fine-tuned CodeT5 '
                 'model and scores the semantic similarity of the summaries.'),
)

# Guard the launch so importing this module (e.g. for testing) does not
# start a web server.
if __name__ == '__main__':
    iface.launch()