# Hugging Face Space: arXiv paper search & summarization (Gradio app)
from transformers import pipeline | |
import requests | |
import xml.etree.ElementTree as ET | |
from datetime import datetime | |
import pandas as pd | |
from fpdf import FPDF | |
import os | |
import gradio as gr | |
from tqdm import tqdm | |
# Load the Hugging Face summarization pipeline once at import time so every
# request reuses the same model instance (downloads the checkpoint on first run).
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
def summarise_paper(text):
    """Summarize *text* with the module-level Hugging Face pipeline.

    Parameters
    ----------
    text : str
        The abstract (or other passage) to summarize.

    Returns
    -------
    str
        The generated summary, or an "Error summarizing: ..." message when
        summarization fails — keeps a batch run going instead of crashing.
    """
    # Guard: don't invoke the model on empty/whitespace-only abstracts.
    if not text or not text.strip():
        return "Error summarizing: empty abstract"
    try:
        # truncation=True clips inputs longer than the model's maximum
        # context instead of raising mid-batch.
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:  # deliberate best-effort: report, don't crash
        return f"Error summarizing: {str(e)}"
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API and return matching papers as a DataFrame.

    Parameters
    ----------
    topic : str
        Phrase searched (quoted, exact-phrase) across all arXiv fields.
    start_date, end_date : str
        Inclusive "YYYY-MM-DD" bounds on the publication date.

    Returns
    -------
    pandas.DataFrame
        Columns: Title, Authors, Abstract, Published Date, Link.
        Empty DataFrame on HTTP failure or when no entry is in the window.
    """
    atom = '{http://www.w3.org/2005/Atom}'  # Atom namespace used by arXiv
    url = "https://export.arxiv.org/api/query"
    params = {
        # Pass literal quotes and let requests URL-encode them: hard-coding
        # %22 here gets double-encoded to %2522, breaking phrase quoting.
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    root = ET.fromstring(response.content)
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")
    data = []
    for entry in root.findall(f'{atom}entry'):
        published = entry.find(f'{atom}published').text
        # Timestamps look like "YYYY-MM-DDThh:mm:ssZ"; keep the date part.
        published_dt = datetime.strptime(published.split("T")[0], "%Y-%m-%d")
        if not (start_date_dt <= published_dt <= end_date_dt):
            continue
        authors = ', '.join(
            a.find(f'{atom}name').text for a in entry.findall(f'{atom}author')
        )
        data.append({
            "Title": entry.find(f'{atom}title').text,
            "Authors": authors,
            "Abstract": entry.find(f'{atom}summary').text,
            "Published Date": published,
            # Rewriting the abs page URL yields a direct PDF link.
            "Link": entry.find(f'{atom}id').text.replace("abs", "pdf") + ".pdf",
        })
    return pd.DataFrame(data) if data else pd.DataFrame()
def add_summary_column(df):
    """Add a 'Summary' column by running summarise_paper over 'Abstract'.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results; expected to contain an 'Abstract' column.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame with a 'Summary' column added. An empty result
        set (the search function returns a column-less empty frame on
        failure) is returned untouched instead of raising KeyError.
    """
    if df.empty or 'Abstract' not in df.columns:
        return df
    tqdm.pandas(desc="Summarizing Papers")  # enables progress_apply with a bar
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
class PDF(FPDF):
    """FPDF subclass providing a fixed report header and a page-number footer."""

    def header(self):
        # Bold, centered report title repeated at the top of every page.
        self.set_font(family='Arial', style='B', size=12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # Italic page counter, centered, 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font(family='Arial', style='I', size=8)
        page_label = f'Page {self.page_no()}'
        self.cell(0, 10, page_label, align='C')
def _write_markdown(df, markdown_file):
    """Write one markdown section per paper row to *markdown_file*."""
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")


def _latin1(text):
    """FPDF's core fonts are latin-1 only; replace unencodable characters."""
    return text.encode('latin-1', 'replace').decode('latin-1')


def _write_pdf(df, pdf_file):
    """Render one PDF section per paper row to *pdf_file*."""
    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)
    for _, row in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, _latin1(f"Title: {row['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, _latin1(f"Authors: {row['Authors']}\n"))
        pdf.multi_cell(0, 10, _latin1(f"Publication Date: {row['Published Date']}\n"))
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, _latin1(f"{row['Summary']}\n"))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)  # visual separator between papers
        pdf.ln(5)
    pdf.output(pdf_file)


def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Export the summarized papers as both a Markdown file and a PDF.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Title, Authors, Published Date, and Summary columns.
    output_dir : str
        Directory for the exports (created if missing).
    output_name : str
        Base filename (without extension) for both files.

    Returns
    -------
    tuple[str, str]
        (markdown_path, pdf_path) of the written files.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")
    _write_markdown(df, markdown_file)
    _write_pdf(df, pdf_file)
    return markdown_file, pdf_file
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio callback: search arXiv, summarize each paper, export files.

    Parameters
    ----------
    topic, start_date, end_date : str
        Forwarded to the arXiv search (dates as "YYYY-MM-DD").
    export_type : str
        'PDF' to return the PDF export, anything else returns the Markdown.

    Returns
    -------
    tuple
        (display DataFrame with Title/Authors/Published Date, file path or
        None when no papers matched).
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    # An empty search result has no columns at all, so the column selection
    # below would raise KeyError; return an empty table and no file instead.
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
# --- Gradio UI -------------------------------------------------------------
# NOTE: Gradio renders components in creation order; reordering these
# statements changes the page layout.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    # Search inputs — dates are free-text strings in YYYY-MM-DD form
    # (validated only when datetime.strptime runs inside the search).
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    # Outputs: a results table plus a downloadable export file.
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    # Wire the button to the end-to-end pipeline defined above.
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])

if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug=True surfaces tracebacks.
    demo.launch(show_error=True, debug=True, share=True)