# Hugging Face Space: arXiv paper search & summarization (Gradio app)
from transformers import pipeline | |
import requests | |
import xml.etree.ElementTree as ET | |
from datetime import datetime | |
import pandas as pd | |
from fpdf import FPDF | |
import os | |
import gradio as gr | |
from tqdm import tqdm | |
# Load the Hugging Face summarization pipeline once at import time so every
# request reuses the same model instance (downloads the checkpoint on first run).
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
def summarise_paper(text):
    """Summarize *text* with the module-level Hugging Face pipeline.

    Parameters
    ----------
    text : str
        The abstract (or other passage) to summarize.

    Returns
    -------
    str
        The generated summary, or an "Error summarizing: ..." message when
        summarization fails — keeps a batch run going instead of crashing.
    """
    # Guard: don't invoke the model on empty/whitespace-only abstracts.
    if not text or not text.strip():
        return "Error summarizing: empty abstract"
    try:
        # truncation=True clips inputs longer than the model's maximum
        # context instead of raising mid-batch.
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:  # deliberate best-effort: report, don't crash
        return f"Error summarizing: {str(e)}"
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API and return matching papers as a DataFrame.

    Parameters
    ----------
    topic : str
        Phrase searched (quoted, exact-phrase) across all arXiv fields.
    start_date, end_date : str
        Inclusive "YYYY-MM-DD" bounds on the publication date.

    Returns
    -------
    pandas.DataFrame
        Columns: Title, Authors, Abstract, Published Date, Link.
        Empty DataFrame on HTTP failure or when no entry is in the window.
    """
    atom = '{http://www.w3.org/2005/Atom}'  # Atom namespace used by arXiv
    url = "https://export.arxiv.org/api/query"
    params = {
        # Pass literal quotes and let requests URL-encode them: hard-coding
        # %22 here gets double-encoded to %2522, breaking phrase quoting.
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    root = ET.fromstring(response.content)
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")
    data = []
    for entry in root.findall(f'{atom}entry'):
        published = entry.find(f'{atom}published').text
        # Timestamps look like "YYYY-MM-DDThh:mm:ssZ"; keep the date part.
        published_dt = datetime.strptime(published.split("T")[0], "%Y-%m-%d")
        if not (start_date_dt <= published_dt <= end_date_dt):
            continue
        authors = ', '.join(
            a.find(f'{atom}name').text for a in entry.findall(f'{atom}author')
        )
        data.append({
            "Title": entry.find(f'{atom}title').text,
            "Authors": authors,
            "Abstract": entry.find(f'{atom}summary').text,
            "Published Date": published,
            # Rewriting the abs page URL yields a direct PDF link.
            "Link": entry.find(f'{atom}id').text.replace("abs", "pdf") + ".pdf",
        })
    return pd.DataFrame(data) if data else pd.DataFrame()
def add_summary_column(df):
    """Add a 'Summary' column by running summarise_paper over 'Abstract'.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results; expected to contain an 'Abstract' column.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame with a 'Summary' column added. An empty result
        set (the search function returns a column-less empty frame on
        failure) is returned untouched instead of raising KeyError.
    """
    if df.empty or 'Abstract' not in df.columns:
        return df
    tqdm.pandas(desc="Summarizing Papers")  # enables progress_apply with a bar
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
class PDF(FPDF):
    """FPDF subclass providing a fixed report header and a page-number footer."""

    def header(self):
        # Bold, centered report title repeated at the top of every page.
        self.set_font(family='Arial', style='B', size=12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # Italic page counter, centered, 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font(family='Arial', style='I', size=8)
        page_label = f'Page {self.page_no()}'
        self.cell(0, 10, page_label, align='C')
def _write_markdown(df, markdown_file):
    """Write one markdown section per paper row to *markdown_file*."""
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        for _, row in df.iterrows():
            md_file.write(f"## {row['Title']}\n\n")
            md_file.write(f"**Authors**: {row['Authors']}\n\n")
            md_file.write(f"**Publication Date**: {row['Published Date']}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{row['Summary']}\n\n")
            md_file.write("---\n\n")


def _latin1(text):
    """FPDF's core fonts are latin-1 only; replace unencodable characters."""
    return text.encode('latin-1', 'replace').decode('latin-1')


def _write_pdf(df, pdf_file):
    """Render one PDF section per paper row to *pdf_file*."""
    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)
    for _, row in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, _latin1(f"Title: {row['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, _latin1(f"Authors: {row['Authors']}\n"))
        pdf.multi_cell(0, 10, _latin1(f"Publication Date: {row['Published Date']}\n"))
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, _latin1(f"{row['Summary']}\n"))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)  # visual separator between papers
        pdf.ln(5)
    pdf.output(pdf_file)


def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Export the summarized papers as both a Markdown file and a PDF.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Title, Authors, Published Date, and Summary columns.
    output_dir : str
        Directory for the exports (created if missing).
    output_name : str
        Base filename (without extension) for both files.

    Returns
    -------
    tuple[str, str]
        (markdown_path, pdf_path) of the written files.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")
    _write_markdown(df, markdown_file)
    _write_pdf(df, pdf_file)
    return markdown_file, pdf_file
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio callback: search arXiv, summarize each paper, export files.

    Parameters
    ----------
    topic, start_date, end_date : str
        Forwarded to the arXiv search (dates as "YYYY-MM-DD").
    export_type : str
        'PDF' to return the PDF export, anything else returns the Markdown.

    Returns
    -------
    tuple
        (display DataFrame with Title/Authors/Published Date, file path or
        None when no papers matched).
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    # An empty search result has no columns at all, so the column selection
    # below would raise KeyError; return an empty table and no file instead.
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
# --- Gradio UI -------------------------------------------------------------
# NOTE: Gradio renders components in creation order; reordering these
# statements changes the page layout.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    # Search inputs — dates are free-text strings in YYYY-MM-DD form
    # (validated only when datetime.strptime runs inside the search).
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    # Outputs: a results table plus a downloadable export file.
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    fetch_button = gr.Button("Fetch and Summarize")
    # Wire the button to the end-to-end pipeline defined above.
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])

if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug=True surfaces tracebacks.
    demo.launch(show_error=True, debug=True, share=True)