Create app.py
app.py (ADDED)

import datetime
import os
import tempfile
import requests
import feedparser
import gradio as gr
from transformers import pipeline
import pdfkit

# Initialize the summarizer (this may download the model on first run)
summarizer = pipeline("summarization")
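# Note: no checkpoint is pinned above, so transformers falls back to its default
# summarization model (sshleifer/distilbart-cnn-12-6 at the time of writing).
# Passing an explicit model= argument would make the Space reproducible across releases.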

def summarize_papers(topic, start_date, end_date, output_format):
    """
    Fetch papers from arXiv via its direct API based on a topic and date range,
    summarize their abstracts, and return the result as either a markdown string
    or a PDF file.

    Parameters:
        topic (str): The subject/topic to search for.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        output_format (str): "markdown" or "pdf".

    Returns:
        str: Markdown text or file path to the generated PDF.
    """
    try:
        start_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError:
        return "Invalid date format. Please use YYYY-MM-DD."

    # Build the API query URL using the direct ArXiv API.
    base_url = "http://export.arxiv.org/api/query"
    search_query = "all:" + topic
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": 50,  # Adjust as needed
        "sortBy": "submittedDate",
        "sortOrder": "descending"
    }
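    # For reference (illustrative, URL-encoding aside): with topic="deep learning" the
    # request above amounts to querying
    # http://export.arxiv.org/api/query?search_query=all:deep+learning&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending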

    # Fetch the XML feed from ArXiv (with a timeout so a slow request cannot hang the app)
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Failed to fetch results from arXiv: {e}"
    feed = feedparser.parse(response.text)

    markdown_output = f"# ArXiv Papers on '{topic}'\n\n"
    found = False

    for entry in feed.entries:
        # Parse the published date (e.g., '2022-01-15T18:00:00Z')
        entry_date = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
        if start_dt <= entry_date <= end_dt:
            found = True
            # Summarize the abstract using the Hugging Face pipeline
            try:
                summary_text = summarizer(entry.summary, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
            except Exception as e:
                summary_text = f"Error in summarization: {e}"
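            # Note: abstracts longer than the model's input limit can make the call above
            # fail; passing truncation=True to the summarizer call should guard against that.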

            # Build markdown content for each paper
            markdown_output += f"## {entry.title}\n"
            # entry.authors is a list of dicts with a 'name' field
            authors = ", ".join([author.name for author in entry.authors])
            markdown_output += f"**Authors:** {authors}\n\n"
            markdown_output += f"**Published:** {entry_date.strftime('%Y-%m-%d')}\n\n"
            markdown_output += f"**Summary:** {summary_text}\n\n"
            markdown_output += f"**Link:** [arXiv]({entry.id})\n\n"
            markdown_output += "---\n\n"

    if not found:
        markdown_output += "\n_No papers found within the specified date range._\n"

    if output_format == "markdown":
        return markdown_output
    elif output_format == "pdf":
        # Convert markdown to a simple HTML wrapper for PDF conversion.
        html_content = "<html><body>" + markdown_output.replace("\n", "<br>") + "</body></html>"
        temp_dir = tempfile.gettempdir()
        pdf_path = os.path.join(temp_dir, "arxiv_summary.pdf")
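        # Note: a fixed filename means concurrent users overwrite each other's PDF;
        # tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) would give each request its own file.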
        try:
            # pdfkit requires the external wkhtmltopdf binary to be available on the system.
            pdfkit.from_string(html_content, pdf_path)
            return pdf_path  # Shown as a plain path string by the Textbox output below.
        except Exception as e:
            return f"PDF conversion failed: {e}"
    else:
        return "Unsupported output format."

# Build the Gradio interface.
iface = gr.Interface(
    fn=summarize_papers,
    inputs=[
        gr.Textbox(label="Topic", placeholder="e.g., deep learning"),
        gr.Textbox(label="Start Date (YYYY-MM-DD)", placeholder="2022-01-01"),
        gr.Textbox(label="End Date (YYYY-MM-DD)", placeholder="2022-12-31"),
        gr.Radio(choices=["markdown", "pdf"], label="Output Format", value="markdown")
    ],
    # gr.outputs.Textbox is the legacy API and is gone in Gradio 4; use gr.Textbox directly.
    outputs=gr.Textbox(label="Result (Markdown or PDF File Path)"),
    title="ArXiv Paper Summarizer",
    description=(
        "Enter a topic and a date range to fetch and summarize ArXiv papers. "
        "The output is returned as markdown text or as the path to a generated PDF file."
    ),
)

iface.launch()
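
A note on dependencies (a sketch based on the imports above; the exact Space configuration is an assumption): the app needs its Python packages declared in requirements.txt, and pdfkit only works when the external wkhtmltopdf binary is installed, which on Hugging Face Spaces is typically handled through a packages.txt file.

requirements.txt
    gradio
    requests
    feedparser
    transformers
    torch
    pdfkit

packages.txt
    wkhtmltopdf

For a quick check without the UI, the function can also be called directly, e.g. print(summarize_papers("deep learning", "2022-01-01", "2022-12-31", "markdown")).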