Spaces:
Running
Running
File size: 5,734 Bytes
338915e 74f32ba 338915e 74f32ba 338915e 851d7c4 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e 74f32ba 338915e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
from transformers import pipeline
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import os
import gradio as gr
from tqdm import tqdm
# Load the Hugging Face summarization pipeline once at module import time
# (model: Falconsai/text_summarization); reused by summarise_paper for every
# abstract so the model weights are only loaded once.
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
def summarise_paper(text):
    """Summarize *text* with the module-level HF pipeline.

    Returns the summary string on success; on any failure returns an
    "Error summarizing: ..." message instead of raising, so callers
    (e.g. a DataFrame apply) never abort mid-batch.
    """
    try:
        result = summarizer(text, max_length=150, min_length=50, do_sample=False)
    except Exception as exc:
        return f"Error summarizing: {str(exc)}"
    return result[0]['summary_text']
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API for *topic* and return matching papers.

    Fetches up to 100 most-recently-submitted entries, then filters them
    client-side to the inclusive [start_date, end_date] window (both
    "YYYY-MM-DD" strings).

    Returns a DataFrame with columns Title / Authors / Abstract /
    Published Date / Link, or an empty DataFrame on no matches or on a
    non-200 HTTP response (the status code is printed in that case).
    """
    ATOM = '{http://www.w3.org/2005/Atom}'  # Atom XML namespace prefix
    url = "https://export.arxiv.org/api/query"
    params = {
        # BUG FIX: requests URL-encodes param values itself, so pre-encoded
        # %22 quotes were double-escaped to %2522 in the actual request.
        # Literal quotes produce the intended exact-phrase query.
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    root = ET.fromstring(response.content)
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    data = []
    for entry in root.findall(f'{ATOM}entry'):
        publication_date = entry.find(f'{ATOM}published').text
        # arXiv timestamps look like "2024-01-31T12:00:00Z"; keep the date part.
        publication_date_dt = datetime.strptime(publication_date.split("T")[0], "%Y-%m-%d")
        if not (start_date_dt <= publication_date_dt <= end_date_dt):
            continue
        title = entry.find(f'{ATOM}title').text
        authors = ', '.join(
            author.find(f'{ATOM}name').text
            for author in entry.findall(f'{ATOM}author')
        )
        abstract = entry.find(f'{ATOM}summary').text
        # The Atom <id> is the abstract page URL; rewrite it to the PDF URL.
        link = entry.find(f'{ATOM}id').text.replace("abs", "pdf") + ".pdf"
        data.append({
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Published Date": publication_date,
            "Link": link,
        })
    return pd.DataFrame(data) if data else pd.DataFrame()
def add_summary_column(df):
    """Return *df* with a 'Summary' column built from its 'Abstract' column.

    Each abstract is run through summarise_paper (progress shown via tqdm).

    BUG FIX: the arXiv search returns a bare empty ``pd.DataFrame()`` when
    nothing matched or the API errored; that frame has no 'Abstract' column,
    so the apply raised KeyError. An empty frame now just gains an empty
    'Summary' column.
    """
    if df.empty:
        out = df.copy()
        out['Summary'] = pd.Series(dtype=object)
        return out
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
class PDF(FPDF):
    """FPDF subclass adding a fixed title header and page-number footer."""

    def header(self):
        # Centered bold title repeated at the top of every page.
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # 15 mm from the bottom edge; italic "Page N" centered.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')
def _latin1(text):
    """Coerce *text* to Latin-1-safe characters (FPDF core fonts are Latin-1 only)."""
    return text.encode('latin-1', 'replace').decode('latin-1')

def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Write every row of *df* into one Markdown file and one PDF file.

    *df* must carry Title / Authors / Published Date / Summary columns.
    Creates *output_dir* if needed and returns
    ``(markdown_path, pdf_path)``.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    # --- Markdown export -------------------------------------------------
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")

    # --- PDF export ------------------------------------------------------
    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)
    for _, row in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, _latin1(f"Title: {row['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, _latin1(f"Authors: {row['Authors']}\n"))
        pdf.multi_cell(0, 10, _latin1(f"Publication Date: {row['Published Date']}\n"))
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, _latin1(f"{row['Summary']}\n"))
        pdf.ln(5)
        # Horizontal separator between papers.
        pdf.cell(0, 5, "-" * 100, ln=True)
        pdf.ln(5)
    pdf.output(pdf_file)

    return markdown_file, pdf_file
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio handler: search arXiv, summarize, export, return (table, file).

    *export_type* is 'PDF' or 'Markdown' and selects which generated file
    path is returned for download.

    BUG FIX: an empty search result (no matches or API error) used to crash
    when slicing the missing Title/Authors/Published Date columns; it now
    returns an empty table and no file.
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
# Gradio UI: text inputs for the query, a radio for the export format, and
# outputs for the result table plus a downloadable summary file.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    # Wire the button to the full pipeline: search -> summarize -> export.
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])
if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug/show_error surface tracebacks.
    demo.launch(show_error=True, debug=True, share=True)
|