Spaces:
Running
Running
Move PDF content extraction to a separate module
Browse files- app.py +4 -26
- helpers/file_manager.py +40 -0
app.py
CHANGED
|
@@ -19,9 +19,9 @@ from dotenv import load_dotenv
|
|
| 19 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
| 20 |
from langchain_core.messages import HumanMessage
|
| 21 |
from langchain_core.prompts import ChatPromptTemplate
|
| 22 |
-
from pypdf import PdfReader
|
| 23 |
|
| 24 |
import global_config as gcfg
|
|
|
|
| 25 |
from global_config import GlobalConfig
|
| 26 |
from helpers import llm_helper, pptx_helper, text_helper
|
| 27 |
|
|
@@ -274,7 +274,9 @@ def set_up_chat_ui():
|
|
| 274 |
):
|
| 275 |
prompt_text = prompt.text or ''
|
| 276 |
if prompt['files']:
|
| 277 |
-
|
|
|
|
|
|
|
| 278 |
print(f'{prompt["files"]=}')
|
| 279 |
|
| 280 |
provider, llm_name = llm_helper.get_provider_model(
|
|
@@ -502,30 +504,6 @@ def generate_slide_deck(json_str: str) -> Union[pathlib.Path, None]:
|
|
| 502 |
return path
|
| 503 |
|
| 504 |
|
| 505 |
-
def get_pdf_contents(
|
| 506 |
-
pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
|
| 507 |
-
max_pages: int = GlobalConfig.MAX_PAGE_COUNT
|
| 508 |
-
) -> str:
|
| 509 |
-
"""
|
| 510 |
-
Extract the text contents from a PDF file.
|
| 511 |
-
|
| 512 |
-
:param pdf_file: The uploaded PDF file.
|
| 513 |
-
:param max_pages: The max no. of pages to extract contents from.
|
| 514 |
-
:return: The contents.
|
| 515 |
-
"""
|
| 516 |
-
|
| 517 |
-
print(f'{type(pdf_file)=}')
|
| 518 |
-
reader = PdfReader(pdf_file)
|
| 519 |
-
n_pages = min(max_pages, len(reader.pages))
|
| 520 |
-
text = ''
|
| 521 |
-
|
| 522 |
-
for page in range(n_pages):
|
| 523 |
-
page = reader.pages[page]
|
| 524 |
-
text += page.extract_text()
|
| 525 |
-
|
| 526 |
-
return text
|
| 527 |
-
|
| 528 |
-
|
| 529 |
def _is_it_refinement() -> bool:
|
| 530 |
"""
|
| 531 |
Whether it is the initial prompt or a refinement.
|
|
|
|
| 19 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
| 20 |
from langchain_core.messages import HumanMessage
|
| 21 |
from langchain_core.prompts import ChatPromptTemplate
|
|
|
|
| 22 |
|
| 23 |
import global_config as gcfg
|
| 24 |
+
import helpers.file_manager as filem
|
| 25 |
from global_config import GlobalConfig
|
| 26 |
from helpers import llm_helper, pptx_helper, text_helper
|
| 27 |
|
|
|
|
| 274 |
):
|
| 275 |
prompt_text = prompt.text or ''
|
| 276 |
if prompt['files']:
|
| 277 |
+
# Apparently, Streamlit stores uploaded files in memory and clears on browser close
|
| 278 |
+
# https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
|
| 279 |
+
st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(prompt['files'][0])
|
| 280 |
print(f'{prompt["files"]=}')
|
| 281 |
|
| 282 |
provider, llm_name = llm_helper.get_provider_model(
|
|
|
|
| 504 |
return path
|
| 505 |
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
def _is_it_refinement() -> bool:
|
| 508 |
"""
|
| 509 |
Whether it is the initial prompt or a refinement.
|
helpers/file_manager.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File manager helper to work with uploaded files.
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from pypdf import PdfReader
|
| 10 |
+
|
| 11 |
+
sys.path.append('..')
|
| 12 |
+
sys.path.append('../..')
|
| 13 |
+
|
| 14 |
+
from global_config import GlobalConfig
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_pdf_contents(
        pdf_file: st.runtime.uploaded_file_manager.UploadedFile,
        max_pages: int = GlobalConfig.MAX_PAGE_COUNT
) -> str:
    """
    Read the text of the leading pages of an uploaded PDF file.

    At most `max_pages` pages are read, starting from the first page. The text
    extracted from each page is joined in page order, with no separator added
    between pages.

    :param pdf_file: The uploaded PDF file to read from.
    :param max_pages: The upper limit on the number of pages to read.
    :return: The combined text of the pages that were read.
    """

    pdf = PdfReader(pdf_file)
    # Stay within both the document length and the caller-supplied limit.
    page_limit = min(max_pages, len(pdf.pages))

    return ''.join(
        pdf.pages[idx].extract_text() for idx in range(page_limit)
    )
|