Spaces:
Running
Running
File size: 5,734 Bytes
338915e 74f32ba 338915e 74f32ba 338915e 851d7c4 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
from transformers import pipeline
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import os
import gradio as gr
from tqdm import tqdm
# Load the Hugging Face summarization pipeline once at module import time
# (model: Falconsai/text_summarization); reused by summarise_paper for every
# abstract so the model weights are only loaded once.
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
def summarise_paper(text):
    """Summarize *text* with the module-level HF pipeline.

    Returns the summary string on success; on any failure returns an
    "Error summarizing: ..." message instead of raising, so callers
    (e.g. a DataFrame apply) never abort mid-batch.
    """
    try:
        result = summarizer(text, max_length=150, min_length=50, do_sample=False)
    except Exception as exc:
        return f"Error summarizing: {str(exc)}"
    return result[0]['summary_text']
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API for *topic* and return matching papers.

    Fetches up to 100 most-recently-submitted entries, then filters them
    client-side to the inclusive [start_date, end_date] window (both
    "YYYY-MM-DD" strings).

    Returns a DataFrame with columns Title / Authors / Abstract /
    Published Date / Link, or an empty DataFrame on no matches or on a
    non-200 HTTP response (the status code is printed in that case).
    """
    ATOM = '{http://www.w3.org/2005/Atom}'  # Atom XML namespace prefix
    url = "https://export.arxiv.org/api/query"
    params = {
        # BUG FIX: requests URL-encodes param values itself, so pre-encoded
        # %22 quotes were double-escaped to %2522 in the actual request.
        # Literal quotes produce the intended exact-phrase query.
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    root = ET.fromstring(response.content)
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    data = []
    for entry in root.findall(f'{ATOM}entry'):
        publication_date = entry.find(f'{ATOM}published').text
        # arXiv timestamps look like "2024-01-31T12:00:00Z"; keep the date part.
        publication_date_dt = datetime.strptime(publication_date.split("T")[0], "%Y-%m-%d")
        if not (start_date_dt <= publication_date_dt <= end_date_dt):
            continue
        title = entry.find(f'{ATOM}title').text
        authors = ', '.join(
            author.find(f'{ATOM}name').text
            for author in entry.findall(f'{ATOM}author')
        )
        abstract = entry.find(f'{ATOM}summary').text
        # The Atom <id> is the abstract page URL; rewrite it to the PDF URL.
        link = entry.find(f'{ATOM}id').text.replace("abs", "pdf") + ".pdf"
        data.append({
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Published Date": publication_date,
            "Link": link,
        })
    return pd.DataFrame(data) if data else pd.DataFrame()
def add_summary_column(df):
    """Return *df* with a 'Summary' column built from its 'Abstract' column.

    Each abstract is run through summarise_paper (progress shown via tqdm).

    BUG FIX: the arXiv search returns a bare empty ``pd.DataFrame()`` when
    nothing matched or the API errored; that frame has no 'Abstract' column,
    so the apply raised KeyError. An empty frame now just gains an empty
    'Summary' column.
    """
    if df.empty:
        out = df.copy()
        out['Summary'] = pd.Series(dtype=object)
        return out
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
class PDF(FPDF):
    """FPDF subclass adding a fixed title header and page-number footer."""

    def header(self):
        # Centered bold title repeated at the top of every page.
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # 15 mm from the bottom edge; italic "Page N" centered.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')
def _latin1(text):
    """Coerce *text* to Latin-1-safe characters (FPDF core fonts are Latin-1 only)."""
    return text.encode('latin-1', 'replace').decode('latin-1')

def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Write every row of *df* into one Markdown file and one PDF file.

    *df* must carry Title / Authors / Published Date / Summary columns.
    Creates *output_dir* if needed and returns
    ``(markdown_path, pdf_path)``.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    # --- Markdown export -------------------------------------------------
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")

    # --- PDF export ------------------------------------------------------
    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)
    for _, row in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, _latin1(f"Title: {row['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, _latin1(f"Authors: {row['Authors']}\n"))
        pdf.multi_cell(0, 10, _latin1(f"Publication Date: {row['Published Date']}\n"))
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, _latin1(f"{row['Summary']}\n"))
        pdf.ln(5)
        # Horizontal separator between papers.
        pdf.cell(0, 5, "-" * 100, ln=True)
        pdf.ln(5)
    pdf.output(pdf_file)

    return markdown_file, pdf_file
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio handler: search arXiv, summarize, export, return (table, file).

    *export_type* is 'PDF' or 'Markdown' and selects which generated file
    path is returned for download.

    BUG FIX: an empty search result (no matches or API error) used to crash
    when slicing the missing Title/Authors/Published Date columns; it now
    returns an empty table and no file.
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
# Gradio UI: text inputs for the query, a radio for the export format, and
# outputs for the result table plus a downloadable summary file.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    # Wire the button to the full pipeline: search -> summarize -> export.
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])
if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug/show_error surface tracebacks.
    demo.launch(show_error=True, debug=True, share=True)
|