"""Gradio app: sentence-by-sentence paraphraser with a title generator.

The input text is split into sentences, each sentence is reworded on its own
by the ``humarin/chatgpt_paraphraser_on_T5_base`` model, and three candidate
titles for the reworded text are produced by ``Ateeqq/news-title-generator``.
"""
import html
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Paraphrasing model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

# Title-generation model and its tokenizer.
tokenizer_gen_title = AutoTokenizer.from_pretrained("Ateeqq/news-title-generator")
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained("Ateeqq/news-title-generator")

# A sentence boundary is one whitespace character that follows ".", "?" or
# "!", unless the text just before it looks like an abbreviation: the
# lookbehinds reject "x.y." shapes (e.g. "e.g.") and capital + lowercase +
# "." shapes (e.g. "Mr.").  Compiled once because it is reused on every call.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

# Number of candidate titles shown for each paraphrased text.
_TITLE_COUNT = 3


def generate_title(input_text: str) -> str:
    """Return one sampled title for *input_text*.

    Sampling is enabled (``do_sample=True``), so repeated calls with the same
    text may return different titles.
    """
    input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt")
    output = model_gen_title.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=20,
    )
    return tokenizer_gen_title.decode(output[0], skip_special_tokens=True)


def split_into_sentences(paragraph: str) -> list[str]:
    """Split *paragraph* into a list of sentences.

    The single whitespace character that forms each boundary is dropped; any
    other whitespace (including newlines) stays inside the returned pieces.
    A paragraph that ends with punctuation followed by whitespace yields a
    trailing empty string, exactly as ``re.split`` produces it.
    """
    return _SENTENCE_BOUNDARY.split(paragraph)


def paraphrase(
    text,
    beam_search,
    temperature=0.8,
    max_length=128
):
    """Reword *text* sentence by sentence and propose titles for the result.

    Args:
        text: Text to paraphrase.
        beam_search: When truthy, generate with 20 beams; otherwise with 1.
        temperature: Sampling temperature forwarded to ``model.generate``.
        max_length: ``max_length`` forwarded to ``model.generate`` for each
            sentence.

    Returns:
        A ``(titles_html, paraphrased_text)`` tuple.  ``titles_html`` holds
        three ``Title N: ...<br>`` entries; ``paraphrased_text`` is the
        reworded sentences, each followed by a single space.  Both are empty
        strings when *text* is empty or contains only whitespace.
    """
    # The interface declares two outputs, so always hand back a pair instead
    # of falling through with ``None`` when there is nothing to process.
    if not text or not text.strip():
        return "", ""

    rephrased = []
    for sentence in split_into_sentences(text):
        # The splitter emits empty pieces (e.g. after trailing punctuation +
        # whitespace); prompting the model with nothing would only add
        # made-up text to the output.
        if not sentence.strip():
            continue
        input_ids = tokenizer(
            f'paraphrase: {sentence}',  # task prefix used to prompt the model
            return_tensors="pt",
            padding="longest",
        ).input_ids
        outputs = model.generate(
            input_ids,
            do_sample=True,
            num_beams=20 if beam_search else 1,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=4,
        )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        rephrased.append(decoded[0])

    # Every sentence, including the last, is followed by one space.
    paraphrased_text = "".join(f"{sentence} " for sentence in rephrased)

    # Titles are model output rendered through gr.HTML, so escape them before
    # embedding them in markup.
    titles_list = "".join(
        f"Title {number}: {html.escape(generate_title(paraphrased_text))}<br>"
        for number in range(1, _TITLE_COUNT + 1)
    )
    return (titles_list, paraphrased_text)


iface = gr.Interface(
    fn=paraphrase,
    inputs=[
        gr.Textbox(label="Paste text in the input box and press 'Submit'.", lines=10),
        "checkbox",  # beam_search
        gr.Slider(0.1, 2, 0.8),  # temperature
    ],
    outputs=[
        gr.HTML(label="Titles:"),
        gr.Textbox(label="Rephrased text:", lines=15),
    ],
    title="AI Paraphraser with Title Generator",
    description="Sentencet-to-sentence rewording backed with GPT-3.5 training set",
    article=(
        "<div align=left><h1>AI Paraphraser and Title Generator</h1>"
        "<li>Each sentence is rephrased separately without context.</li>"
        "<li>Temperature: Increase value for more creative rewordings. "
        "Higher values may corrupt the sentence. "
        "Reset value after pressing 'Clear'</li>"
        "<li>Beam search: Try for safer and conservative rephrasing.</li>"
        "<p>Models:<br>"
        "<li>Training set derived by using Chat-GPT3.5. No competition intended.</li>"
        "<li>Original models: humarin/chatgpt_paraphraser_on_T5_base and "
        "Ateeq_news_title_generator. Deployment code modified for long text inputs.</li></p>"
        "<p>Parameter details:<br>"
        "<li>For rephraser: Beam search: No. of beams = 20, no_repeat_ngram_size=4, "
        "do_sample=True.</li>"
        "<li>For title generator: do_sample=True, temperature=0.8, top_k = 20 </li></div>"
    ),
    flagging_mode='never',
)

iface.launch()