Spaces:
Sleeping
Sleeping
Istvan-Adem
committed on
Commit
·
67deb87
1
Parent(s):
70de892
add pytesseract
Browse files
- ocr/__init__.py +0 -12
- ocr/api/message/openai_request.py +2 -2
- ocr/api/message/prompts.py +2 -2
- ocr/api/message/utils.py +10 -15
- ocr/api/message/views.py +3 -3
- requirements.txt +2 -0
ocr/__init__.py
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
from fastapi import FastAPI
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
from starlette.exceptions import HTTPException as StarletteHTTPException
|
6 |
-
from starlette.staticfiles import StaticFiles
|
7 |
|
8 |
from ocr.core.config import settings
|
9 |
from ocr.core.wrappers import OcrResponseWrapper, ErrorOcrResponse
|
@@ -22,15 +19,6 @@ def create_app() -> FastAPI:
|
|
22 |
allow_headers=["*"],
|
23 |
)
|
24 |
|
25 |
-
static_directory = os.path.join(settings.BASE_DIR, 'static')
|
26 |
-
if not os.path.exists(static_directory):
|
27 |
-
os.makedirs(static_directory)
|
28 |
-
|
29 |
-
app.mount(
|
30 |
-
'/static',
|
31 |
-
StaticFiles(directory='static'),
|
32 |
-
)
|
33 |
-
|
34 |
@app.exception_handler(StarletteHTTPException)
|
35 |
async def http_exception_handler(_, exc):
|
36 |
return OcrResponseWrapper(
|
|
|
|
|
|
|
1 |
from fastapi import FastAPI
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
|
4 |
|
5 |
from ocr.core.config import settings
|
6 |
from ocr.core.wrappers import OcrResponseWrapper, ErrorOcrResponse
|
|
|
19 |
allow_headers=["*"],
|
20 |
)
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
@app.exception_handler(StarletteHTTPException)
|
23 |
async def http_exception_handler(_, exc):
|
24 |
return OcrResponseWrapper(
|
ocr/api/message/openai_request.py
CHANGED
@@ -3,7 +3,7 @@ from ocr.core.wrappers import openai_wrapper
|
|
3 |
|
4 |
|
5 |
@openai_wrapper(model='gpt-4o-mini')
|
6 |
-
async def generate_report(
|
7 |
messages = [
|
8 |
{
|
9 |
"role": "system",
|
@@ -11,7 +11,7 @@ async def generate_report(request_content: list[dict]):
|
|
11 |
},
|
12 |
{
|
13 |
"role": "user",
|
14 |
-
"content":
|
15 |
}
|
16 |
]
|
17 |
return messages
|
|
|
3 |
|
4 |
|
5 |
@openai_wrapper(model='gpt-4o-mini')
|
6 |
+
async def generate_report(text: str):
|
7 |
messages = [
|
8 |
{
|
9 |
"role": "system",
|
|
|
11 |
},
|
12 |
{
|
13 |
"role": "user",
|
14 |
+
"content": f"Generate a report based on this data:\n\n```\n{text}\n```"
|
15 |
}
|
16 |
]
|
17 |
return messages
|
ocr/api/message/prompts.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
class OCRPrompts:
|
2 |
generate_general_answer = """## Task
|
3 |
|
4 |
-
You must analyze the
|
5 |
|
6 |
## Report Structure
|
7 |
|
@@ -38,7 +38,7 @@ The report should be structured as follows, with each section containing only re
|
|
38 |
|
39 |
## Instructions
|
40 |
|
41 |
-
- **Do not invent or infer any information.** Only use data provided in the
|
42 |
- Ensure that the format is followed strictly, and the output is complete without any deviations.
|
43 |
|
44 |
[/INST]"""
|
|
|
1 |
class OCRPrompts:
|
2 |
generate_general_answer = """## Task
|
3 |
|
4 |
+
You must analyze the text extracted from medical document and generate a comprehensive report in **Markdown2** format. Ensure that every detail provided in the document is included, and do not omit or modify any information. Your output must strictly follow the required format.
|
5 |
|
6 |
## Report Structure
|
7 |
|
|
|
38 |
|
39 |
## Instructions
|
40 |
|
41 |
+
- **Do not invent or infer any information.** Only use data provided in the user request.
|
42 |
- Ensure that the format is followed strictly, and the output is complete without any deviations.
|
43 |
|
44 |
[/INST]"""
|
ocr/api/message/utils.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
-
import base64
|
2 |
import io
|
3 |
import re
|
4 |
|
|
|
|
|
5 |
from pdf2image import convert_from_bytes
|
6 |
|
7 |
|
@@ -16,21 +17,15 @@ def divide_images(contents: bytes) -> list[bytes]:
|
|
16 |
return image_bytes_list
|
17 |
|
18 |
|
19 |
-
def
|
20 |
-
|
21 |
-
{"type": "text", "text": "Generate a report on the attached document"},
|
22 |
-
*[
|
23 |
-
{
|
24 |
-
"type": "image_url",
|
25 |
-
"image_url": {
|
26 |
-
"url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
|
27 |
-
},
|
28 |
-
}
|
29 |
-
for image in images
|
30 |
-
]
|
31 |
-
]
|
32 |
-
return content
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def clean_response(text: str) -> str:
|
36 |
try:
|
|
|
|
|
1 |
import io
|
2 |
import re
|
3 |
|
4 |
+
import pytesseract
|
5 |
+
from PIL import Image
|
6 |
from pdf2image import convert_from_bytes
|
7 |
|
8 |
|
|
|
17 |
return image_bytes_list
|
18 |
|
19 |
|
20 |
+
def extract_text_from_images(images: list[bytes]) -> str:
    """Run Tesseract OCR over each encoded image and join the results.

    Args:
        images: Encoded image payloads, one per entry, in the order the
            text should appear. The caller in ``views.py`` passes the
            output of ``divide_images``.

    Returns:
        The text recognised in every image, joined with a single newline
        between consecutive images. An empty list yields an empty string.
    """
    extracted_texts = []

    for image_bytes in images:
        # Open the image as a context manager so Pillow releases it as
        # soon as OCR for this page is done, instead of waiting for
        # garbage collection.
        with Image.open(io.BytesIO(image_bytes)) as image:
            extracted_texts.append(pytesseract.image_to_string(image))

    return '\n'.join(extracted_texts)
|
29 |
|
30 |
def clean_response(text: str) -> str:
|
31 |
try:
|
ocr/api/message/views.py
CHANGED
@@ -3,7 +3,7 @@ from fastapi import File, UploadFile
|
|
3 |
from ocr.api.message import ocr_router
|
4 |
from ocr.api.message.openai_request import generate_report
|
5 |
from ocr.api.message.schemas import OcrResponse
|
6 |
-
from ocr.api.message.utils import divide_images,
|
7 |
from ocr.core.wrappers import OcrResponseWrapper
|
8 |
|
9 |
|
@@ -14,8 +14,8 @@ async def get_all_chat_messages(
|
|
14 |
try:
|
15 |
contents = await file.read()
|
16 |
images = divide_images(contents)
|
17 |
-
|
18 |
-
response = await generate_report(
|
19 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response)))
|
20 |
finally:
|
21 |
await file.close()
|
|
|
3 |
from ocr.api.message import ocr_router
|
4 |
from ocr.api.message.openai_request import generate_report
|
5 |
from ocr.api.message.schemas import OcrResponse
|
6 |
+
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
|
7 |
from ocr.core.wrappers import OcrResponseWrapper
|
8 |
|
9 |
|
|
|
14 |
try:
|
15 |
contents = await file.read()
|
16 |
images = divide_images(contents)
|
17 |
+
text_content = extract_text_from_images(images)
|
18 |
+
response = await generate_report(text_content)
|
19 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response)))
|
20 |
finally:
|
21 |
await file.close()
|
requirements.txt
CHANGED
@@ -11,11 +11,13 @@ httpx==0.28.1
|
|
11 |
idna==3.10
|
12 |
jiter==0.8.2
|
13 |
openai==1.59.9
|
|
|
14 |
pdf2image==1.17.0
|
15 |
pillow==11.1.0
|
16 |
pydantic==2.10.5
|
17 |
pydantic_core==2.27.2
|
18 |
pydash==8.0.5
|
|
|
19 |
python-dotenv==1.0.1
|
20 |
python-multipart==0.0.20
|
21 |
PyYAML==6.0.2
|
|
|
11 |
idna==3.10
|
12 |
jiter==0.8.2
|
13 |
openai==1.59.9
|
14 |
+
packaging==24.2
|
15 |
pdf2image==1.17.0
|
16 |
pillow==11.1.0
|
17 |
pydantic==2.10.5
|
18 |
pydantic_core==2.27.2
|
19 |
pydash==8.0.5
|
20 |
+
pytesseract==0.3.13
|
21 |
python-dotenv==1.0.1
|
22 |
python-multipart==0.0.20
|
23 |
PyYAML==6.0.2
|