from transformers import pipeline
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import os
import gradio as gr
from tqdm import tqdm

# Load Hugging Face Summarization Pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

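# Summarize a single abstract; return an error string instead of raising on failure.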
def summarise_paper(text):
    try:
        # Truncate inputs that exceed the model's maximum context length
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False, truncation=True)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error summarizing: {str(e)}"

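# Query the arXiv Atom API for up to 100 recent papers matching the topic,
# then keep only those published within the given date range.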
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    url = "https://export.arxiv.org/api/query"
    params = {
        # Quote the topic so arXiv treats it as an exact phrase; requests URL-encodes the quotes
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    response = requests.get(url, params=params, timeout=30)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        entries = root.findall('{http://www.w3.org/2005/Atom}entry')

        start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

        data = []
        for entry in entries:
            publication_date = entry.find('{http://www.w3.org/2005/Atom}published').text
            publication_date_dt = datetime.strptime(publication_date.split("T")[0], "%Y-%m-%d")

            if start_date_dt <= publication_date_dt <= end_date_dt:
                title = entry.find('{http://www.w3.org/2005/Atom}title').text
                authors = ', '.join([author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')])
                abstract = entry.find('{http://www.w3.org/2005/Atom}summary').text
                link = entry.find('{http://www.w3.org/2005/Atom}id').text.replace("abs", "pdf") + ".pdf"

                data.append({
                    "Title": title,
                    "Authors": authors,
                    "Abstract": abstract,
                    "Published Date": publication_date,
                    "Link": link,
                })

        return pd.DataFrame(data) if data else pd.DataFrame()
    else:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

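# Add a 'Summary' column by summarizing every abstract, with a tqdm progress bar.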
def add_summary_column(df):
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df

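# FPDF subclass that adds a report title header and a page-number footer to every page.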
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')

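# Export the summarized papers as both a Markdown file and a PDF report; returns both file paths.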
def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")

    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)

    for _, row in df.iterrows():
        title = row['Title']
        authors = row['Authors']
        publication_date = row['Published Date']
        summary = row['Summary']

        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, f"Title: {title}\n".encode('latin-1', 'replace').decode('latin-1'))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, f"Authors: {authors}\n".encode('latin-1', 'replace').decode('latin-1'))
        pdf.multi_cell(0, 10, f"Publication Date: {publication_date}\n".encode('latin-1', 'replace').decode('latin-1'))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, f"{summary}\n".encode('latin-1', 'replace').decode('latin-1'))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)
        pdf.ln(5)

    pdf.output(pdf_file)
    return markdown_file, pdf_file


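# Gradio callback: fetch matching papers, summarize them, export, and return the table plus the requested file.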
def fetch_and_summarize(topic, start_date, end_date, export_type):
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    if df.empty:
        # No matching papers: return an empty table and no download file
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path

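# Build the Gradio interface: topic/date inputs, export format selector, results table, and download link.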
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])

if __name__ == "__main__":
    demo.launch(show_error=True, debug=True, share=True)