buelfhood committed on
Commit
338915e
·
verified ·
1 Parent(s): cf98a8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -100
app.py CHANGED
@@ -1,108 +1,140 @@
1
- import datetime
2
- import os
3
- import tempfile
4
  import requests
5
- import feedparser
 
 
 
 
6
  import gradio as gr
7
- from transformers import pipeline
8
- import pdfkit
9
-
10
- # Initialize the summarizer (this may download the model on first run)
11
- summarizer = pipeline("summarization")
12
-
13
- def summarize_papers(topic, start_date, end_date, output_format):
14
- """
15
- Fetch papers from arXiv via its direct API based on a topic and date range,
16
- summarize their abstracts, and return the result as either a markdown string
17
- or a PDF file.
18
-
19
- Parameters:
20
- topic (str): The subject/topic to search for.
21
- start_date (str): Start date in 'YYYY-MM-DD' format.
22
- end_date (str): End date in 'YYYY-MM-DD' format.
23
- output_format (str): "markdown" or "pdf".
24
-
25
- Returns:
26
- str: Markdown text or file path to the generated PDF.
27
- """
28
  try:
29
- start_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
30
- end_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
31
- except ValueError:
32
- return "Invalid date format. Please use YYYY-MM-DD."
33
-
34
- # Build the API query URL using the direct ArXiv API.
35
- base_url = "http://export.arxiv.org/api/query"
36
- search_query = "all:" + topic
37
  params = {
38
- "search_query": search_query,
39
  "start": 0,
40
- "max_results": 50, # Adjust as needed
41
  "sortBy": "submittedDate",
42
- "sortOrder": "descending"
43
  }
44
-
45
- # Fetch the XML feed from ArXiv
46
- response = requests.get(base_url, params=params)
47
- feed = feedparser.parse(response.text)
48
-
49
- markdown_output = f"# ArXiv Papers on '{topic}'\n\n"
50
- found = False
51
-
52
- for entry in feed.entries:
53
- # Parse the published date (e.g., '2022-01-15T18:00:00Z')
54
- entry_date = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
55
- if start_dt <= entry_date <= end_dt:
56
- found = True
57
- # Summarize the abstract using Hugging Face pipeline
58
- try:
59
- summary_text = summarizer(entry.summary, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
60
- except Exception as e:
61
- summary_text = "Error in summarization."
62
-
63
- # Build markdown content for each paper
64
- markdown_output += f"## {entry.title}\n"
65
- # entry.authors is a list of dicts with a 'name' field
66
- authors = ", ".join([author.name for author in entry.authors])
67
- markdown_output += f"**Authors:** {authors}\n\n"
68
- markdown_output += f"**Published:** {entry_date.strftime('%Y-%m-%d')}\n\n"
69
- markdown_output += f"**Summary:** {summary_text}\n\n"
70
- markdown_output += f"**Link:** [arXiv]({entry.id})\n\n"
71
- markdown_output += "---\n\n"
72
-
73
- if not found:
74
- markdown_output += "\n_No papers found within the specified date range._\n"
75
-
76
- if output_format == "markdown":
77
- return markdown_output
78
- elif output_format == "pdf":
79
- # Convert markdown to a simple HTML wrapper for PDF conversion.
80
- html_content = "<html><body>" + markdown_output.replace("\n", "<br>") + "</body></html>"
81
- temp_dir = tempfile.gettempdir()
82
- pdf_path = os.path.join(temp_dir, "arxiv_summary.pdf")
83
- try:
84
- pdfkit.from_string(html_content, pdf_path)
85
- return pdf_path # Gradio will treat this as a downloadable file.
86
- except Exception as e:
87
- return f"PDF conversion failed: {e}"
88
  else:
89
- return "Unsupported output format."
90
-
91
- # Build the Gradio interface.
92
- iface = gr.Interface(
93
- fn=summarize_papers,
94
- inputs=[
95
- gr.Textbox(label="Topic", placeholder="e.g., deep learning"),
96
- gr.Textbox(label="Start Date (YYYY-MM-DD)", placeholder="2022-01-01"),
97
- gr.Textbox(label="End Date (YYYY-MM-DD)", placeholder="2022-12-31"),
98
- gr.Radio(choices=["markdown", "pdf"], label="Output Format", value="markdown")
99
- ],
100
- outputs=gr.outputs.Textbox(label="Result (Markdown or PDF File Path)"),
101
- title="ArXiv Paper Summarizer",
102
- description=(
103
- "Enter a topic and a date range to fetch and summarize ArXiv papers. "
104
- "The output can be returned as markdown text or as a downloadable PDF file."
105
- ),
106
- )
107
-
108
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
 
 
2
  import requests
3
+ import xml.etree.ElementTree as ET
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ from fpdf import FPDF
7
+ import os
8
  import gradio as gr
9
+ from tqdm import tqdm
10
+
11
# Load the Hugging Face summarization pipeline once at module import time.
# NOTE(review): downloads facebook/bart-large-cnn on first run — presumably
# acceptable on a Space with persistent cache; confirm for cold starts.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
13
+
14
def summarise_paper(text):
    """Summarize *text* with the module-level BART pipeline.

    Returns the summary string on success; on any pipeline failure
    (e.g. input exceeding the model's token limit) returns an error
    message string instead of raising, so callers can keep going.
    """
    try:
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
        )
        return result[0]["summary_text"]
    except Exception as exc:
        # Best-effort: surface the failure as text rather than crash the app.
        return f"Error summarizing: {str(exc)}"
20
+
21
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API for papers matching *topic* and keep those
    published within [start_date, end_date] (inclusive, 'YYYY-MM-DD').

    Returns a DataFrame with columns Title, Authors, Abstract,
    Published Date, Link — or an empty DataFrame on HTTP error or when
    no entry falls inside the date range.
    """
    url = "https://export.arxiv.org/api/query"
    # Bug fix: pass the raw quoted phrase and let `requests` percent-encode
    # the params.  The previous value f'all:%22{topic}%22' was pre-encoded,
    # so requests turned %22 into %2522 (double encoding) and the quoted
    # phrase search never matched as intended.
    params = {
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # timeout prevents the Gradio worker from hanging forever on a stalled
    # connection; 30s is generous for the arXiv export endpoint.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    ns = "{http://www.w3.org/2005/Atom}"  # Atom namespace, hoisted once
    root = ET.fromstring(response.content)

    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    data = []
    for entry in root.findall(f"{ns}entry"):
        # '2022-01-15T18:00:00Z' -> keep only the date part for comparison.
        publication_date = entry.find(f"{ns}published").text
        publication_date_dt = datetime.strptime(
            publication_date.split("T")[0], "%Y-%m-%d"
        )
        if not (start_date_dt <= publication_date_dt <= end_date_dt):
            continue

        title = entry.find(f"{ns}title").text
        authors = ", ".join(
            author.find(f"{ns}name").text
            for author in entry.findall(f"{ns}author")
        )
        abstract = entry.find(f"{ns}summary").text
        # The Atom <id> is the /abs/ page; rewrite to the direct PDF URL.
        link = entry.find(f"{ns}id").text.replace("abs", "pdf") + ".pdf"

        data.append({
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Published Date": publication_date,
            "Link": link,
        })

    return pd.DataFrame(data) if data else pd.DataFrame()
62
+
63
def add_summary_column(df):
    """Return *df* with a 'Summary' column built by summarise_paper().

    Bug fix: when the arXiv search finds nothing it returns a completely
    empty ``pd.DataFrame()`` (no columns), so ``df['Abstract']`` raised
    ``KeyError``.  Guard that case and return an empty frame carrying the
    columns downstream code expects.
    """
    if df.empty or 'Abstract' not in df.columns:
        return pd.DataFrame(
            columns=['Title', 'Authors', 'Abstract', 'Published Date',
                     'Link', 'Summary']
        )
    # Show a progress bar while summarizing row by row.
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
67
+
68
class PDF(FPDF):
    """FPDF subclass that adds a fixed title header and a page-number footer."""

    def header(self):
        # Centered bold title repeated at the top of every page.
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # Page number, centered, 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')
78
+
79
def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Render every row of *df* into one Markdown file and one PDF.

    Expects columns Title, Authors, Published Date, Summary.  Writes
    <output_dir>/<output_name>.md and .pdf, creating the directory if
    needed, and returns (markdown_path, pdf_path).
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    # --- Markdown: assemble the whole document in memory, write once. ---
    parts = ["# Research Paper Summaries\n\n"]
    for _, paper in df.iterrows():
        parts.append(f"## {paper['Title']}\n\n")
        parts.append(f"**Authors**: {paper['Authors']}\n\n")
        parts.append(f"**Publication Date**: {paper['Published Date']}\n\n")
        parts.append("### Summary\n")
        parts.append(f"{paper['Summary']}\n\n")
        parts.append("---\n\n")
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("".join(parts))

    # --- PDF: FPDF core fonts are Latin-1 only, so replace-encode text. ---
    def latin1(text):
        return text.encode('latin-1', 'replace').decode('latin-1')

    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)

    for _, paper in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, latin1(f"Title: {paper['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, latin1(f"Authors: {paper['Authors']}\n"))
        pdf.multi_cell(0, 10, latin1(f"Publication Date: {paper['Published Date']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, latin1(f"{paper['Summary']}\n"))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)  # visual separator between papers
        pdf.ln(5)

    pdf.output(pdf_file)
    return markdown_file, pdf_file
120
+
121
+
122
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio callback: fetch arXiv papers, summarize, and export.

    Parameters mirror the UI widgets; export_type 'PDF' selects the PDF
    path, anything else the Markdown path.  Returns (display_table,
    file_path) for the Dataframe and File components.
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    # Bug fix: an empty search result has no columns, so the column
    # selection below raised KeyError and crashed the UI.  Return an
    # empty table and no download file instead.
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
128
+
129
# Build the Gradio UI: three text inputs, an export-type radio, a results
# table, a download slot, and one button wired to fetch_and_summarize.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic_box = gr.Textbox(label="Topic")
    start_box = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_box = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_choice = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    results_table = gr.Dataframe(label="Summarized Papers")
    download_file = gr.File(label="Download Summary")
    run_button = gr.Button("Fetch and Summarize")
    run_button.click(
        fetch_and_summarize,
        inputs=[topic_box, start_box, end_box, export_choice],
        outputs=[results_table, download_file],
    )

if __name__ == "__main__":
    demo.launch(show_error=True, debug=True, share=True)