# Hugging Face Spaces page residue (kept as comments so the file parses):
# Spaces: Sleeping
# Sleeping
| import gradio as gr | |
| import ast | |
| from nbformat.v4 import (new_notebook, new_markdown_cell, new_code_cell) | |
| import pdfplumber | |
| import google.generativeai as genai | |
| import nbformat | |
| import re | |
def classify_page(statement):
    """Split one page of extracted PDF text into "Code" / "Text" blocks.

    Sends *statement* to the Gemini API together with a one-shot example
    prompt and parses the model reply into a Python literal.

    Parameters
    ----------
    statement : str
        Raw text of a single PDF page.

    Returns
    -------
    list
        A sequence of ("Code" | "Text", block_text) tuples, as produced by
        ``ast.literal_eval`` on the cleaned model reply.

    Raises
    ------
    KeyError
        If the ``GOOGLE_API_KEY`` environment variable is not set.
    ValueError, SyntaxError
        If the model reply is not a valid Python literal.
    """
    import os

    # SECURITY FIX: the original hard-coded a live API key in source.
    # Read it from the environment (set GOOGLE_API_KEY in the Space secrets).
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

    # temperature=0 keeps the classification as deterministic as the API allows.
    generation_config = {
        "temperature": 0,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    chat_session = model.start_chat(history=[])

    # One-shot prompt: a worked example, then the new page text to classify.
    # NOTE(review): the example "response_content" is NOT a valid Python
    # literal (its strings are unquoted); the model is being trusted to emit
    # properly quoted tuples that ast.literal_eval can parse — consider
    # tightening the example if parsing failures are observed.
    prompt = f"""
Group the following "Input" strings as substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python tuples where the first element of each tuple is either "Code" or "Text" and the second element is the corresponding grouped substring block.
Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.
# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests
Then we access the website link, read the web page content and do some pre-processing.
fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")
response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]
Now, classify this string:
Input: {statement}
"""
    response = chat_session.send_message(prompt)
    # Strip the markdown code fence the model usually wraps the literal in,
    # then drop non-printable/non-ASCII characters that would break parsing.
    cleaned = response.text.replace("```python\n", "").replace("```", "").strip()
    cleaned = re.sub(r"[^\x20-\x7E]", "", cleaned)
    print(cleaned)  # debug: the exact string handed to ast.literal_eval
    return ast.literal_eval(cleaned)
def create_notebook(file, tc, bc):
    """Convert a PDF of mixed prose and Python code into a Jupyter notebook.

    Each page is cropped (to drop header/footer artwork), its text is
    extracted with pdfplumber, classified into "Code"/"Text" blocks by
    ``classify_page``, and appended to the notebook as code or markdown cells.

    Parameters
    ----------
    file : str
        Path of the uploaded PDF file (must end in ``.pdf``).
    tc : float
        Top crop height in pixels — header area to exclude.
    bc : float
        Bottom crop height in pixels — footer area to exclude.

    Returns
    -------
    str
        Path of the generated ``.ipynb`` file.
    """
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            # Crop box excludes the top `tc` and bottom `bc` pixels.
            crop_box = (0, tc, width, height - bc)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                # Blank / image-only page: nothing to classify.
                continue
            for c, value in classify_page(text):
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    # Markdown needs a blank line to start a new paragraph.
                    value = value.replace("\n", "\n\n")
                    # Strip stray bracket artifacts the model sometimes emits.
                    notebook.cells.append(
                        new_markdown_cell(value.replace('[[', '').replace(']', ''))
                    )
            print(f"Page No.{p+1} completed")
    file_path = file.split('.pdf')[0] + '.ipynb'
    # BUG FIX: the original wrote to `file_path + '.ipynb'`, producing a
    # doubled ".ipynb.ipynb" extension while returning a path that did not
    # match the file actually written — write to `file_path` itself.
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)
    print(f'{file_path} notebook created successfully.')
    return file_path
# --- Gradio UI: expose the PDF-to-notebook converter as a web app. ---
with gr.Blocks() as app:
    # Header / usage instructions.
    gr.Markdown("""# PDF to IPython Notebook Convertor App
## Upload your PDF document containing Python code and Text and press 'Process File' button to download the iPython Notebook.
### Adjust Top Crop and Bottom Crop values based on how much of top and bottom design content of your PDF document you want to eliminate.""")

    # Inputs: the PDF plus the header/footer crop heights.
    pdf_file = gr.File(label="Upload a PDF file")
    top_crop = gr.Slider(label='Top Crop in Pixels', value=25)
    bottom_crop = gr.Slider(label='Bottom Crop in pixels', value=25)

    # Output slot and the trigger button.
    result_file = gr.File(label="Download processed file")
    run_button = gr.Button("Process File")
    run_button.click(
        fn=create_notebook,
        inputs=[pdf_file, top_crop, bottom_crop],
        outputs=result_file,
    )

app.launch(debug=True)