"""Gradio app: search arXiv for a topic, summarize abstracts, export a report.

Pipeline: query the arXiv Atom API -> filter entries by publication date ->
summarize each abstract with a Hugging Face model -> write a combined
Markdown file and PDF -> expose the result through a small Gradio UI.
"""

import os
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Optional, Tuple

import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF
from tqdm import tqdm
from transformers import pipeline

ARXIV_API_URL = "https://export.arxiv.org/api/query"
# Namespace map used for every ElementTree lookup on the Atom feed.
ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}
DATE_FORMAT = "%Y-%m-%d"
# Columns of the frame returned by the search, even when it has no rows.
PAPER_COLUMNS = ["Title", "Authors", "Abstract", "Published Date", "Link"]
# Columns shown in the UI table.
DISPLAY_COLUMNS = ["Title", "Authors", "Published Date"]
REQUEST_TIMEOUT_SECONDS = 30

# Load Hugging Face Summarization Pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")


def summarise_paper(text):
    """Return a model-generated summary of ``text``.

    Summarization is best-effort: if the pipeline raises for any reason the
    error is returned as a string ("Error summarizing: ...") instead of being
    propagated, so one bad abstract does not abort a whole batch.
    """
    try:
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:  # deliberate catch-all, see docstring
        return f"Error summarizing: {e}"


def _collapse_whitespace(text: Optional[str]) -> str:
    """Join line-wrapped feed text into a single line ('' for None)."""
    return " ".join((text or "").split())


def _empty_papers_frame() -> pd.DataFrame:
    """Return an empty frame that still carries the expected columns."""
    return pd.DataFrame(columns=PAPER_COLUMNS)


def search_arxiv_to_dataframe_with_text_optimized(
    topic: str, start_date: str, end_date: str, max_results: int = 100
) -> pd.DataFrame:
    """Search arXiv for ``topic`` and return papers published in a date range.

    Args:
        topic: Phrase to search for across all arXiv fields.
        start_date: Inclusive lower bound, formatted ``YYYY-MM-DD``.
        end_date: Inclusive upper bound, formatted ``YYYY-MM-DD``.
        max_results: Maximum number of entries requested from the API
            (newest first); the date filter is applied to those entries.

    Returns:
        A DataFrame with the columns in ``PAPER_COLUMNS``. It is empty (but
        keeps its columns) when nothing matches or when the request fails;
        failures are reported with ``print``.

    Raises:
        ValueError: If either date does not match ``YYYY-MM-DD``.
    """
    # Validate the dates before touching the network so bad input fails fast.
    start_date_dt = datetime.strptime(start_date.strip(), DATE_FORMAT)
    end_date_dt = datetime.strptime(end_date.strip(), DATE_FORMAT)

    # Pass literal quotes: requests URL-encodes parameter values itself, so a
    # pre-encoded "%22" would be sent as "%2522" and break the phrase search.
    phrase = topic.strip().replace('"', "")
    params = {
        "search_query": f'all:"{phrase}"',
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    try:
        response = requests.get(
            ARXIV_API_URL, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return _empty_papers_frame()

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return _empty_papers_frame()

    root = ET.fromstring(response.content)
    rows = []
    for entry in root.findall("atom:entry", ATOM_NS):
        publication_date = entry.findtext(
            "atom:published", default="", namespaces=ATOM_NS
        )
        try:
            publication_date_dt = datetime.strptime(
                publication_date.split("T")[0], DATE_FORMAT
            )
        except ValueError:
            # Entry without a usable publication date; nothing to filter on.
            continue
        if not start_date_dt <= publication_date_dt <= end_date_dt:
            continue

        authors = ", ".join(
            author.findtext("atom:name", default="", namespaces=ATOM_NS)
            for author in entry.findall("atom:author", ATOM_NS)
        )
        paper_id = entry.findtext("atom:id", default="", namespaces=ATOM_NS)
        rows.append({
            # Feed titles/abstracts are line-wrapped; flatten them so the
            # Markdown heading and the PDF cells stay on sensible lines.
            "Title": _collapse_whitespace(
                entry.findtext("atom:title", default="", namespaces=ATOM_NS)
            ),
            "Authors": authors,
            "Abstract": _collapse_whitespace(
                entry.findtext("atom:summary", default="", namespaces=ATOM_NS)
            ),
            "Published Date": publication_date,
            # Rewrite only the "/abs/" path segment, not every "abs" substring.
            "Link": paper_id.replace("/abs/", "/pdf/", 1) + ".pdf",
        })

    return pd.DataFrame(rows, columns=PAPER_COLUMNS)


def add_summary_column(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'Summary' column built from the 'Abstract' column.

    The frame is modified in place and also returned. An empty frame gets an
    empty 'Summary' column without invoking the summarizer.
    """
    if df.empty:
        df['Summary'] = pd.Series(dtype="object")
        return df
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df


class PDF(FPDF):
    """FPDF document with a fixed report title header and a page-number footer."""

    def header(self):
        """Draw the centered report title at the top of each page."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        """Draw the centered page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')


def _latin1_safe(text: str) -> str:
    """Replace characters outside latin-1 with '?' before writing to the PDF."""
    return text.encode('latin-1', 'replace').decode('latin-1')


def generate_combined_markdown_and_pdf(
    df: pd.DataFrame, output_dir: str = "output", output_name: str = "all_papers"
) -> Tuple[str, str]:
    """Write every paper in ``df`` to one Markdown file and one PDF.

    Args:
        df: Frame with 'Title', 'Authors', 'Published Date' and 'Summary'.
        output_dir: Directory for the output files; created if missing.
        output_name: File name (without extension) shared by both outputs.

    Returns:
        ``(markdown_path, pdf_path)``.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")

    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)

    for _, row in df.iterrows():
        title = row['Title']
        authors = row['Authors']
        publication_date = row['Published Date']
        summary = row['Summary']

        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, _latin1_safe(f"Title: {title}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, _latin1_safe(f"Authors: {authors}\n"))
        pdf.multi_cell(0, 10, _latin1_safe(f"Publication Date: {publication_date}\n"))
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, _latin1_safe(f"{summary}\n"))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)
        pdf.ln(5)

    pdf.output(pdf_file)
    return markdown_file, pdf_file


def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio callback: search, summarize, export, and return the results.

    Args:
        topic: Search phrase.
        start_date: Inclusive start date, ``YYYY-MM-DD``.
        end_date: Inclusive end date, ``YYYY-MM-DD``.
        export_type: ``'PDF'`` selects the PDF; any other value selects the
            Markdown file.

    Returns:
        ``(table, file_path)`` where ``table`` holds the title, authors and
        publication date of each paper and ``file_path`` is the export to
        download. When no papers are found the table is empty and
        ``file_path`` is ``None``.
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    if df.empty:
        # Nothing to summarize or export; avoid writing empty report files.
        return pd.DataFrame(columns=DISPLAY_COLUMNS), None

    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[DISPLAY_COLUMNS], file_path


with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    fetch_button.click(
        fetch_and_summarize,
        inputs=[topic, start_date, end_date, export_type],
        outputs=[output_table, output_file],
    )

if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link for the app;
    # confirm that is intended before deploying.
    demo.launch(show_error=True, debug=True, share=True)