File size: 3,892 Bytes
a0f19e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
from google.colab import userdata
import gradio as gr
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
import PyPDF2
from langchain.prompts import PromptTemplate

# Read both secrets from the Colab secret store, then mirror them into the
# process environment in one step (HF_TOKEN first, then GROQ_API_KEY).
hf_api_key = userdata.get('HF_TOKEN')
groq_api_key = userdata.get('GROQ_API_KEY')
os.environ.update({'HF_TOKEN': hf_api_key, 'GROQ_API_KEY': groq_api_key})

# Shared chat model used by the summarization chains below; temperature is
# fixed at 0.
llm = ChatGroq(
    temperature=0,
    model_name='llama-3.1-8b-instant',
    groq_api_key=groq_api_key,
)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source; it is handed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        str: The per-page text joined with no separator. A page whose
        extraction yields nothing contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` keeps the join from raising TypeError if a page's extraction
    # result is falsy (e.g. None); join also avoids repeated string
    # re-allocation from ``+=`` in a loop.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def chunk_text(text):
    """Split ``text`` into overlapping pieces wrapped as ``Document`` objects.

    The splitter is configured for pieces of up to 4000 characters with a
    400-character overlap, measuring length with ``len``.

    Args:
        text: The string to split.

    Returns:
        list: One ``Document`` per piece, in the order the splitter
        produced them.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=400,
        length_function=len,
    )
    documents = []
    for piece in splitter.split_text(text):
        documents.append(Document(page_content=piece))
    return documents

def summarize_chunks(chunks):
    """Summarize a list of documents with the module-level ``llm``.

    The strategy depends on the combined length of ``page_content`` across
    all chunks: below 10000 characters a single "stuff" chain is used with
    the combine prompt; otherwise a verbose "map_reduce" chain summarizes
    each chunk with the map prompt and merges the results with the combine
    prompt.

    Args:
        chunks: Documents exposing a ``page_content`` string.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    # Per-chunk prompt (only used by the map_reduce strategy).
    map_prompt = PromptTemplate(
        template="""Write a detailed summary of the following text:
    "{text}"
    DETAILED SUMMARY:""",
        input_variables=["text"],
    )

    # Final prompt: the only prompt for "stuff", the merge step for map_reduce.
    combine_prompt = PromptTemplate(
        template="""Write a comprehensive summary of the following text, capturing key points and main ideas:
    "{text}"
    COMPREHENSIVE SUMMARY:""",
        input_variables=["text"],
    )

    char_count = 0
    for doc in chunks:
        char_count += len(doc.page_content)

    if char_count >= 10000:
        # Longer documents: summarize chunks individually, then combine.
        chain_options = {
            "chain_type": "map_reduce",
            "map_prompt": map_prompt,
            "combine_prompt": combine_prompt,
            "verbose": True,
        }
    else:
        # Shorter documents: everything goes into one prompt.
        chain_options = {
            "chain_type": "stuff",
            "prompt": combine_prompt,
        }

    chain = load_summarize_chain(llm, **chain_options)
    return chain.run(chunks)

def summarize_content(pdf_file, text_input):
    """Summarize an uploaded PDF or, when no PDF is given, the typed text.

    When both inputs are supplied the PDF takes priority.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded.
        text_input: Text entered by the user; may be empty or ``None``.

    Returns:
        str: The generated summary, or a short message explaining why
        nothing could be summarized (no usable input, or a PDF from which
        no text could be extracted).
    """
    # Whitespace-only input is treated the same as no input, so blank
    # content is never sent to the model.
    has_text_input = bool(text_input and text_input.strip())
    if pdf_file is None and not has_text_input:
        return "Please upload a PDF file or enter text to summarize."

    if pdf_file is not None:
        text = extract_text_from_pdf(pdf_file)
        if not text or not text.strip():
            # Nothing to summarize; report it rather than summarizing an
            # empty document.
            return "No extractable text was found in the uploaded PDF."
    else:
        text = text_input

    return summarize_chunks(chunk_text(text))

# Build the Gradio UI. Components are created inside nested context managers,
# so the statement order below determines the page layout.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page header and short description.
    gr.Markdown(
    """
    # PDF And Text Summarizer
    ### Advanced PDF and Text Summarization -
    
    Upload your PDF document or enter text directly, and let AI generate a concise, informative summary.
    """
    )
    
    with gr.Row():
        # Left column (scale=1): the two alternative inputs and the submit button.
        with gr.Column(scale=1):
            input_pdf = gr.File(label="Upload PDF (optional)", file_types=[".pdf"])
            input_text = gr.Textbox(label="Or enter text here", lines=5, placeholder="Paste or type your text here...")
            submit_btn = gr.Button("Generate Summary", variant="primary")
        
        # Right column (scale=2): the textbox that receives the summary.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Generated Summary", lines=10)
    
    # Usage instructions shown below the input/output row.
    gr.Markdown(
    """
    ### How it works
    1. Upload a PDF file or enter text directly
    2. Click "Generate Summary"
    3. Wait for the AI to process and summarize your content
    4. Review the generated summary
    
    *Powered by LLAMA 3.1 8B model and LangChain*
    """
    )
    
    # On click, call summarize_content(input_pdf, input_text) and write its
    # return value into the output textbox.
    submit_btn.click(summarize_content, inputs=[input_pdf, input_text], outputs=output)

# Start the app as soon as the script is executed.
iface.launch()