import gradio as gr
from transformers import TextStreamer
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "xiddiqui/News_Summarizer",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's optimized inference mode

# Define the summarization function
def generate_summary(input_text):
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following:

### Input:
{}

### Response:
"""

    # Tokenize the prompt and keep the tensors on the same device as the model
    # (a 4-bit model lives on the GPU, so inputs must not be forced onto the CPU)
    inputs = tokenizer(
        [alpaca_prompt.format(input_text)],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(model.device)

    # Set up a TextStreamer so tokens are printed to the console as they are generated;
    # skip_prompt=True keeps the echoed prompt out of the stream
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the summary
    summary_ids = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=64,  # Limit the length of the output
    )

    # Decode only the newly generated tokens, slicing off the prompt
    generated_tokens = summary_ids[0][inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return summary

# Set up Gradio interface
iface = gr.Interface(
    fn=generate_summary,  # Function to be called on user input
    inputs="text",        # Single text box for user input
    outputs="text",       # Output as text
    live=False,           # Regenerating on every keystroke would be far too costly for an LLM
    title="News Summarizer",  # Title of the app
    description="Enter a news article, and get a concise summary of the content.",
)

# Launch Gradio app
iface.launch()
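
# Optional sanity check: a minimal sketch of calling generate_summary() directly,
# e.g. from a REPL or before iface.launch() (launch() blocks, so code placed after
# it only runs once the server stops). The sample text is purely illustrative.
#
# sample_article = (
#     "The city council voted on Tuesday to approve a new transit budget, "
#     "expanding bus service to three suburban districts starting next year."
# )
# print(generate_summary(sample_article))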