buelfhood committed on
Commit
74f32ba
·
verified ·
1 Parent(s): fc0040b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import os
3
+ import tempfile
4
+ import requests
5
+ import feedparser
6
+ import gradio as gr
7
+ from transformers import pipeline
8
+ import pdfkit
9
+
10
# Initialize the summarizer once at import time (this may download the model
# on first run). The checkpoint is named explicitly so results do not change
# if the library's implicit default for the "summarization" task changes.
# NOTE(review): this is believed to be the checkpoint transformers falls back
# to for this task -- confirm against the installed version.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
12
+
13
def _format_paper(entry, published):
    """Render one feed entry as a markdown section, summarizing its abstract."""
    # Summarization is best-effort: a failure on one abstract (including a
    # missing `summary` field) must not abort the whole report.
    try:
        summary_text = summarizer(
            entry.summary, max_length=130, min_length=30, do_sample=False
        )[0]["summary_text"]
    except Exception:
        summary_text = "Error in summarization."

    # Collapse internal whitespace so a hard-wrapped title stays on the
    # single line a markdown heading requires.
    title = " ".join(entry.title.split())
    # entry.authors is a list of dicts with a 'name' field
    authors = ", ".join(
        author.get("name", "Unknown") for author in entry.get("authors", [])
    )
    return (
        f"## {title}\n"
        f"**Authors:** {authors}\n\n"
        f"**Published:** {published.strftime('%Y-%m-%d')}\n\n"
        f"**Summary:** {summary_text}\n\n"
        f"**Link:** [arXiv]({entry.id})\n\n"
        "---\n\n"
    )


def summarize_papers(topic, start_date, end_date, output_format):
    """
    Fetch papers from arXiv via its direct API based on a topic and date range,
    summarize their abstracts, and return the result as either a markdown string
    or a PDF file.

    Parameters:
        topic (str): The subject/topic to search for.
        start_date (str): Start date in 'YYYY-MM-DD' format (inclusive).
        end_date (str): End date in 'YYYY-MM-DD' format (inclusive; papers
            published at any time during this day are included).
        output_format (str): "markdown" or "pdf".

    Returns:
        str: Markdown text, the file path of the generated PDF, or a
        human-readable error message when the input is invalid, the arXiv
        request fails, or PDF conversion fails.
    """
    import html  # local import: only used to escape text for the PDF wrapper

    try:
        start_day = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
        end_day = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
    except (TypeError, ValueError):
        return "Invalid date format. Please use YYYY-MM-DD."

    # Validate everything cheap before any network or model work is done.
    if start_day > end_day:
        return "Start date must not be after end date."
    topic = (topic or "").strip()
    if not topic:
        return "Please enter a topic."
    if output_format not in ("markdown", "pdf"):
        return "Unsupported output format."

    # Build the API query URL using the direct ArXiv API.
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": "all:" + topic,
        "start": 0,
        "max_results": 50,  # Adjust as needed
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # Fetch the XML feed from ArXiv; a timeout keeps a stalled connection
    # from hanging the request indefinitely.
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        return f"Failed to fetch results from arXiv: {exc}"
    feed = feedparser.parse(response.text)

    sections = [f"# ArXiv Papers on '{topic}'\n\n"]
    found = False

    for entry in feed.entries:
        # Parse the published date (e.g., '2022-01-15T18:00:00Z'); skip
        # entries whose timestamp is missing or in an unexpected format.
        try:
            published = datetime.datetime.strptime(
                entry.published, "%Y-%m-%dT%H:%M:%SZ"
            )
        except (AttributeError, TypeError, ValueError):
            continue
        # Compare calendar dates so the whole end day is inside the range.
        if not start_day <= published.date() <= end_day:
            continue
        found = True
        sections.append(_format_paper(entry, published))

    if not found:
        sections.append("\n_No papers found within the specified date range._\n")

    markdown_output = "".join(sections)
    if output_format == "markdown":
        return markdown_output

    # Convert markdown to a simple HTML wrapper for PDF conversion. The text
    # comes from an external feed, so escape it before embedding it in HTML.
    escaped = html.escape(markdown_output).replace("\n", "<br>")
    html_content = (
        '<html><head><meta charset="utf-8"></head><body>'
        + escaped
        + "</body></html>"
    )

    # A unique file per call so concurrent requests do not overwrite each
    # other's PDF.
    fd, pdf_path = tempfile.mkstemp(prefix="arxiv_summary_", suffix=".pdf")
    os.close(fd)
    try:
        pdfkit.from_string(html_content, pdf_path)
    except Exception as e:
        # Do not leave the empty placeholder file behind on failure.
        try:
            os.remove(pdf_path)
        except OSError:
            pass
        return f"PDF conversion failed: {e}"
    return pdf_path  # Gradio will treat this as a downloadable file.
90
+
91
# Build the Gradio interface: four inputs are passed positionally to
# summarize_papers, and its string result is shown in a single text box.
iface = gr.Interface(
    fn=summarize_papers,
    inputs=[
        gr.Textbox(label="Topic", placeholder="e.g., deep learning"),
        gr.Textbox(label="Start Date (YYYY-MM-DD)", placeholder="2022-01-01"),
        gr.Textbox(label="End Date (YYYY-MM-DD)", placeholder="2022-12-31"),
        gr.Radio(choices=["markdown", "pdf"], label="Output Format", value="markdown"),
    ],
    # Use the top-level component, matching the inputs above; the legacy
    # `gr.outputs` namespace is deprecated and missing from current Gradio.
    outputs=gr.Textbox(label="Result (Markdown or PDF File Path)"),
    title="ArXiv Paper Summarizer",
    description=(
        "Enter a topic and a date range to fetch and summarize ArXiv papers. "
        "The output can be returned as markdown text or as a downloadable PDF file."
    ),
)

iface.launch()