import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Set the model name and parameters
model_name = "xiddiqui/News_Summarizer"  # fine-tuned LoRA adapter on Hugging Face
max_seq_length = 2048  # maximum number of prompt tokens kept after truncation

# Set the device to CPU explicitly
device = "cpu"

# Load model and tokenizer

# 1. Load the base model (unsloth/Meta-Llama-3.1-8B)
base_model_name = "unsloth/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the base weights in full precision (float32) so bitsandbytes/4-bit quantization is never needed
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32)

# 2. Load your fine-tuned model with the LoRA adapter
adapter_model_name = model_name  # path of the fine-tuned LoRA adapter on Hugging Face
model = PeftModel.from_pretrained(model, adapter_model_name)

# Move model to CPU (no need for GPU)
model.to(device)

# Define the summarization function
def generate_summary(input_text):
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    Summarize the following: 

    ### Input:
    {}

    ### Response:
    """

    # Tokenize and prepare the input text
    inputs = tokenizer(
        [alpaca_prompt.format(input_text)], 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_seq_length
    ).to(device)  # Ensure computations are done on the CPU

    # Generate summary
    summary_ids = model.generate(
        **inputs,
        max_new_tokens=128,  # limit the length of the generated summary (not prompt + summary)
        num_beams=4,  # beam search for better output quality
        no_repeat_ngram_size=2,  # avoid repeating n-grams in the summary
        early_stopping=True
    )

    # Decode only the newly generated tokens; a causal LM's output also contains the prompt
    prompt_length = inputs["input_ids"].shape[1]
    summary = tokenizer.decode(summary_ids[0][prompt_length:], skip_special_tokens=True)
    return summary

# Set up Gradio interface
iface = gr.Interface(
    fn=generate_summary,  # Function to be called on user input
    inputs="text",  # Single text box for user input
    outputs="text",  # Output as text
    live=True,  # Optional: re-runs generation as the user types; very slow for a large model on CPU
    title="News Summarizer",  # Title of the app
    description="Enter a news article, and get a concise summary of the content."
)

# Launch Gradio app
iface.launch()
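
# To run locally (assuming gradio, transformers, peft, and torch are installed),
# execute this script with Python; Gradio prints a local URL to open in the browser.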