Spaces:

girishwangikar
/

PDF_And_Text_Summarizer

Running

File size: 3,892 Bytes

a0f19e1

import os
from google.colab import userdata
import gradio as gr
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
import PyPDF2
from langchain.prompts import PromptTemplate

# Set up API keys
# Both secrets are read through google.colab's `userdata` helper and then
# exported as environment variables under the same names.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime - confirm how secrets are supplied where this app is deployed.
hf_api_key = userdata.get('HF_TOKEN')
groq_api_key = userdata.get('GROQ_API_KEY')
os.environ['HF_TOKEN'] = hf_api_key
os.environ['GROQ_API_KEY'] = groq_api_key

# Set up LLM
# Module-level Groq chat model (temperature=0, model 'llama-3.1-8b-instant').
# The `llm` name is read as a global by summarize_chunks().
llm = ChatGroq(temperature=0, model_name='llama-3.1-8b-instant', groq_api_key=groq_api_key)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with no separator. A page that yields
        no text (``None`` or ``""``) contributes an empty string instead
        of raising ``TypeError`` during concatenation.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` guards against pages for which extraction returns nothing;
    # str.join avoids rebuilding the accumulated string on every page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def chunk_text(text, chunk_size=4000, chunk_overlap=400):
    """Split text into overlapping chunks wrapped as ``Document`` objects.

    Args:
        text: The string to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 4000, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 400, the value previously hard-coded here.

    Returns:
        list: One ``Document`` per chunk, in the order the splitter
        produced them, with the chunk stored in ``page_content``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(text)
    ]

def summarize_chunks(chunks):
    """Summarize a collection of documents with the module-level ``llm``.

    The strategy depends on the combined length of the documents'
    ``page_content``: under 10,000 characters a single "stuff" chain is
    used with the combine prompt; otherwise a "map_reduce" chain (verbose)
    summarizes each document with the map prompt and then merges the
    partial summaries with the combine prompt.

    Args:
        chunks: Documents exposing ``page_content``; iterated once to
            measure length and then handed to the chain.

    Returns:
        Whatever ``chain.run(chunks)`` returns for the selected chain.
    """
    # Prompt applied to each individual document (map step only).
    per_chunk_prompt = PromptTemplate(
        input_variables=["text"],
        template="""Write a detailed summary of the following text:
    "{text}"
    DETAILED SUMMARY:""",
    )

    # Prompt that produces the final summary (both strategies use it).
    final_prompt = PromptTemplate(
        input_variables=["text"],
        template="""Write a comprehensive summary of the following text, capturing key points and main ideas:
    "{text}"
    COMPREHENSIVE SUMMARY:""",
    )

    # Combined character count decides which chain type is built.
    char_count = 0
    for doc in chunks:
        char_count += len(doc.page_content)

    if char_count >= 10000:
        # Longer documents: summarize piecewise, then combine.
        chain_options = {
            "chain_type": "map_reduce",
            "map_prompt": per_chunk_prompt,
            "combine_prompt": final_prompt,
            "verbose": True,
        }
    else:
        # Shorter documents: send everything in a single prompt.
        chain_options = {
            "chain_type": "stuff",
            "prompt": final_prompt,
        }

    chain = load_summarize_chain(llm, **chain_options)
    return chain.run(chunks)

def summarize_content(pdf_file, text_input):
    """Summarize an uploaded PDF or, if none was uploaded, the typed text.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded.
            When present it takes precedence over ``text_input``.
        text_input: Text typed by the user; may be ``None`` or empty.

    Returns:
        str: The generated summary, or a short user-facing message when
        there is nothing to summarize (no input at all, whitespace-only
        text, or a PDF from which no text could be extracted).
    """
    # Whitespace-only text counts as "no text": sending it on would ask
    # the model to summarize nothing.
    has_typed_text = bool(text_input and text_input.strip())
    if pdf_file is None and not has_typed_text:
        return "Please upload a PDF file or enter text to summarize."

    if pdf_file is not None:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)
    else:
        # Use the input text
        text = text_input

    # A PDF can legitimately yield no text (e.g. pages that are only
    # images); report that instead of summarizing an empty document.
    if not text or not text.strip():
        return (
            "No readable text was found in the input. "
            "Please provide a text-based PDF or enter text directly."
        )

    # Chunk the text, then summarize the chunks.
    chunks = chunk_text(text)
    return summarize_chunks(chunks)

# Build the Gradio interface: a header, an input column (PDF upload, text
# box, submit button), an output column, and a short usage guide.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page title and tagline.
    gr.Markdown(
    """
    # PDF And Text Summarizer
    ### Advanced PDF and Text Summarization -
    
    Upload your PDF document or enter text directly, and let AI generate a concise, informative summary.
    """
    )
    
    with gr.Row():
        # First column (scale=1): the two alternative inputs and the trigger.
        with gr.Column(scale=1):
            input_pdf = gr.File(label="Upload PDF (optional)", file_types=[".pdf"])
            input_text = gr.Textbox(label="Or enter text here", lines=5, placeholder="Paste or type your text here...")
            submit_btn = gr.Button("Generate Summary", variant="primary")
        
        # Second column (scale=2): where the summary is displayed.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Generated Summary", lines=10)
    
    # Usage instructions shown below the input/output row.
    gr.Markdown(
    """
    ### How it works
    1. Upload a PDF file or enter text directly
    2. Click "Generate Summary"
    3. Wait for the AI to process and summarize your content
    4. Review the generated summary
    
    *Powered by LLAMA 3.1 8B model and LangChain*
    """
    )
    
    # Clicking the button calls summarize_content(input_pdf, input_text)
    # and writes its return value into the output textbox.
    submit_btn.click(summarize_content, inputs=[input_pdf, input_text], outputs=output)

# Start the app. There is no `__main__` guard, so this runs whenever the
# module is executed or imported.
iface.launch()