|
|
|
|
|
|
|
# Explicit public API: the model handle, UI strings and widgets, example
# data, the assembled Gradio interface, and the summarize entry point.
__all__ = ['hub_llm', 'title', 'description', 'combine_prompt_template', 'pdf_example_1', 'pdf_example_2', 'prompt_example_1',

           'prompt_example_2', 'upload_file_input', 'custom_prompt_input', 'custom_chunk_input', 'chunk_size_input',

           'chunk_overlap_input', 'examples', 'outputs', 'iface', 'summarize']
|
|
|
|
|
# --- LangChain / model imports -------------------------------------------
from langchain_community.llms import HuggingFaceHub

from langchain_community.document_loaders import PyPDFLoader

from langchain_core.prompts import PromptTemplate

from langchain.chains import LLMChain

from langchain.text_splitter import CharacterTextSplitter

from langchain.chains.mapreduce import MapReduceChain

# NOTE(review): re-imports PromptTemplate and shadows the langchain_core
# import above — presumably both resolve to the same class; confirm and
# drop one of them.
from langchain.prompts import PromptTemplate

from langchain.docstore.document import Document

from langchain.chains.summarize import load_summarize_chain



import os

# NOTE(review): `import dotenv` appears redundant given the
# `from dotenv import load_dotenv` below.
import dotenv

from dotenv import load_dotenv

# Pull environment settings (e.g. the HuggingFace Hub API token) from a
# local .env file before the model client is constructed.
load_dotenv()
|
|
|
|
|
# Hosted BART-large-CNN summarization model served via the HuggingFace Hub.
# The near-zero temperature keeps output close to deterministic, while
# repetition_penalty plus top-k / top-p sampling limits curb degenerate
# repeated text; early_stopping ends generation once beams agree.
hub_llm = HuggingFaceHub(
    repo_id="facebook/bart-large-cnn",
    model_kwargs=dict(
        temperature=0.01,
        max_new_tokens=512,  # 256 * 2 in the original formulation
        min_length=30,
        repetition_penalty=1.2,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
    ),
)
|
|
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
import gradio as gr |
|
import time |
|
|
|
# Header copy shown at the top of the Gradio page.
title="PDF Summarizer"

description="Summarize your PDF using a custom combine prompt."




# Default combine-step prompt for the map-reduce summarization chain;
# `{text}` is substituted with the concatenated per-chunk summaries.
combine_prompt_template = """Write a comprehensive summary of this academic article.



Divide the summary in:

1. Main Objective of the paper

2. Results



{text}



SUMMARY:"""
|
|
|
|
|
# Bundled example PDFs (expected to sit alongside this script) used by the
# Gradio "Examples" section below.
pdf_example_1 = './ZeroShotDataAug.pdf'

pdf_example_2 = './bert.pdf'

# Example prompts; each must keep the `{text}` placeholder expected by
# PromptTemplate(input_variables=['text']).
prompt_example_1 = """Write a comprehensive summary of this academic article.



Divide the summary in:

1. Main Objective of the paper

2. Results



{text}



SUMMARY:"""

prompt_example_2 = """Summarize the following document focusing on the key findings and methodology.



{text}



Summary:"""
|
|
|
|
|
def summarize(pdf_file, custom_prompt, custom_chunk, chunk_size, chunk_overlap):
    """Summarize an uploaded PDF with a map-reduce summarization chain.

    Parameters
    ----------
    pdf_file : gradio file object with a ``.name`` attribute (temp file path).
    custom_prompt : str
        Combine-step prompt; must contain ``{text}``. When empty, falls back
        to the module-level default (as the UI hint promises).
    custom_chunk : bool
        When True, split with the user-provided chunk size/overlap instead of
        the loader's default splitter.
    chunk_size, chunk_overlap : numeric
        Splitter parameters from ``gr.Number`` (arrive as floats).

    Returns
    -------
    str
        The generated summary, or a human-readable error message — this is a
        UI boundary, so exceptions are reported rather than re-raised.
    """
    try:
        file_path = pdf_file.name

        loader = PyPDFLoader(file_path)
        if custom_chunk:
            # gr.Number delivers floats; the splitter expects integer sizes.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
            )
            docs = loader.load_and_split(text_splitter=text_splitter)
        else:
            docs = loader.load_and_split()

        # Honor the UI promise: an empty/blank prompt means "use the default".
        template = custom_prompt if custom_prompt and custom_prompt.strip() else combine_prompt_template
        # Fail fast with a clear message instead of an opaque traceback string.
        if "{text}" not in template:
            return "An error occurred: the prompt must contain the {text} placeholder."
        PROMPT = PromptTemplate(template=template, input_variables=['text'])
        chain = load_summarize_chain(hub_llm, chain_type='map_reduce', combine_prompt=PROMPT)

        # Brief pause to avoid hammering the hosted inference endpoint.
        time.sleep(2)
        summary = chain.invoke(docs)['output_text']
        return summary
    except Exception as e:
        return f"An error occurred: {e}"
|
|
|
# --- Gradio input widgets -------------------------------------------------
# Single-file PDF picker; the callback receives a temp-file handle.
upload_file_input = gr.UploadButton(label="Upload PDF", file_types=[".pdf"], file_count="single")
# Combine-step prompt, pre-filled with the module default.
custom_prompt_input = gr.Textbox(label="Custom Prompt",
                                 lines=10,
                                 value=combine_prompt_template,
                                 info="Define your own prompt or leave empty for default.")
# Toggle between the loader's default splitter and the custom one below.
custom_chunk_input = gr.Checkbox(label="Custom Chunk", value=False, info="Recommended to be left unchecked")
chunk_size_input = gr.Number(label="Chunk Size", value=700, minimum=500, maximum=1000, step=100)
# Fix: step was 100 (copy-paste from chunk size), which spans the whole
# 10-100 range and made the spinner jump straight between the bounds.
chunk_overlap_input = gr.Number(label="Chunk Overlap", value=50, minimum=10, maximum=100, step=10)
|
|
|
# Pre-filled example rows; each row matches the `inputs` order:
# [pdf, prompt, custom_chunk, chunk_size, chunk_overlap].
# Fix: the second bundled example (bert.pdf / prompt_example_2) was defined
# and exported above but never wired into the examples list.
examples = [
    [pdf_example_1, prompt_example_1, False, 700, 50],
    [pdf_example_2, prompt_example_2, False, 700, 50],
]

# Single text box that receives the generated summary (or error message).
outputs = gr.Textbox(label="Summary")
|
|
|
# Assemble the widgets, the summarize callback, and the example rows into
# a single Gradio app. Input order must match summarize()'s signature.
iface = gr.Interface(
    fn=summarize,
    inputs=[
        upload_file_input,
        custom_prompt_input,
        custom_chunk_input,
        chunk_size_input,
        chunk_overlap_input,
    ],
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
)
|
|
|
# Start the local Gradio server; debug mode disabled for normal operation.
iface.launch(debug=False)
|
|