Spaces:
Running
Running
Istvan-Adem
committed on
Commit
·
912670d
1
Parent(s):
71a8eab
Make main identical to ai
Browse files- ocr/__init__.py +0 -12
- ocr/api/message/openai_request.py +17 -0
- ocr/api/message/prompts.py +44 -0
- ocr/api/message/utils.py +10 -15
- ocr/api/message/views.py +6 -2
- ocr/core/wrappers.py +30 -0
- requirements.txt +2 -0
ocr/__init__.py
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
from fastapi import FastAPI
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
from starlette.exceptions import HTTPException as StarletteHTTPException
|
6 |
-
from starlette.staticfiles import StaticFiles
|
7 |
|
8 |
from ocr.core.config import settings
|
9 |
from ocr.core.wrappers import OcrResponseWrapper, ErrorOcrResponse
|
@@ -22,15 +19,6 @@ def create_app() -> FastAPI:
|
|
22 |
allow_headers=["*"],
|
23 |
)
|
24 |
|
25 |
-
static_directory = os.path.join(settings.BASE_DIR, 'static')
|
26 |
-
if not os.path.exists(static_directory):
|
27 |
-
os.makedirs(static_directory)
|
28 |
-
|
29 |
-
app.mount(
|
30 |
-
'/static',
|
31 |
-
StaticFiles(directory='static'),
|
32 |
-
)
|
33 |
-
|
34 |
@app.exception_handler(StarletteHTTPException)
|
35 |
async def http_exception_handler(_, exc):
|
36 |
return OcrResponseWrapper(
|
|
|
|
|
|
|
1 |
from fastapi import FastAPI
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
|
4 |
|
5 |
from ocr.core.config import settings
|
6 |
from ocr.core.wrappers import OcrResponseWrapper, ErrorOcrResponse
|
|
|
19 |
allow_headers=["*"],
|
20 |
)
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
@app.exception_handler(StarletteHTTPException)
|
23 |
async def http_exception_handler(_, exc):
|
24 |
return OcrResponseWrapper(
|
ocr/api/message/openai_request.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ocr.api.message.prompts import OCRPrompts
|
2 |
+
from ocr.core.wrappers import openai_wrapper
|
3 |
+
|
4 |
+
|
5 |
+
@openai_wrapper(model='gpt-4o-mini')
|
6 |
+
async def generate_report(text: str):
|
7 |
+
messages = [
|
8 |
+
{
|
9 |
+
"role": "system",
|
10 |
+
"content": OCRPrompts.generate_general_answer
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"role": "user",
|
14 |
+
"content": f"Generate a report based on this data:\n\n```\n{text}\n```"
|
15 |
+
}
|
16 |
+
]
|
17 |
+
return messages
|
ocr/api/message/prompts.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class OCRPrompts:
|
2 |
+
generate_general_answer = """## Task
|
3 |
+
|
4 |
+
You must analyze the text extracted from medical document and generate a comprehensive report in **Markdown2** format. Ensure that every detail provided in the document is included, and do not omit or modify any information. Your output must strictly follow the required format.
|
5 |
+
|
6 |
+
## Report Structure
|
7 |
+
|
8 |
+
The report should be structured as follows, with each section containing only relevant information from the document:
|
9 |
+
|
10 |
+
```markdown
|
11 |
+
## Patient Information
|
12 |
+
|
13 |
+
- Name: [Patient Name]
|
14 |
+
- Age: [Patient Age]
|
15 |
+
- Date of Scan: [Date]
|
16 |
+
- Indication: [Reason for the CT scan]
|
17 |
+
|
18 |
+
## Findings
|
19 |
+
|
20 |
+
**Primary findings**:
|
21 |
+
[Describe significant abnormalities or findings relevant to the indication]
|
22 |
+
|
23 |
+
** Secondary findings**:
|
24 |
+
[List incidental findings, e.g., "Mild hepatic steatosis noted."]
|
25 |
+
**No abnormalities**:
|
26 |
+
[Mention organs or systems without abnormalities, e.g., "No evidence of lymphadenopathy or pleural effusion."]
|
27 |
+
|
28 |
+
## Impression
|
29 |
+
|
30 |
+
[Summarize the findings concisely, e.g., "Findings suggest a primary lung tumor. Biopsy recommended for further evaluation."]
|
31 |
+
|
32 |
+
## Recommendations
|
33 |
+
|
34 |
+
[Include next steps or further tests, e.g., "PET scan and consultation with oncology recommended."]
|
35 |
+
```
|
36 |
+
|
37 |
+
[INST]
|
38 |
+
|
39 |
+
## Instructions
|
40 |
+
|
41 |
+
- **Do not invent or infer any information.** Only use data provided in the user request.
|
42 |
+
- Ensure that the format is followed strictly, and the output is complete without any deviations.
|
43 |
+
|
44 |
+
[/INST]"""
|
ocr/api/message/utils.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
-
import base64
|
2 |
import io
|
3 |
import re
|
4 |
|
|
|
|
|
5 |
from pdf2image import convert_from_bytes
|
6 |
|
7 |
|
@@ -16,21 +17,15 @@ def divide_images(contents: bytes) -> list[bytes]:
|
|
16 |
return image_bytes_list
|
17 |
|
18 |
|
19 |
-
def
|
20 |
-
|
21 |
-
{"type": "text", "text": "Generate a report on the attached document"},
|
22 |
-
*[
|
23 |
-
{
|
24 |
-
"type": "image_url",
|
25 |
-
"image_url": {
|
26 |
-
"url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
|
27 |
-
},
|
28 |
-
}
|
29 |
-
for image in images
|
30 |
-
]
|
31 |
-
]
|
32 |
-
return content
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def clean_response(text: str) -> str:
|
36 |
try:
|
|
|
|
|
1 |
import io
|
2 |
import re
|
3 |
|
4 |
+
import pytesseract
|
5 |
+
from PIL import Image
|
6 |
from pdf2image import convert_from_bytes
|
7 |
|
8 |
|
|
|
17 |
return image_bytes_list
|
18 |
|
19 |
|
20 |
+
def extract_text_from_images(images: list[bytes]) -> str:
|
21 |
+
extracted_texts = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
+
for image_bytes in images:
|
24 |
+
image = Image.open(io.BytesIO(image_bytes))
|
25 |
+
text = pytesseract.image_to_string(image)
|
26 |
+
extracted_texts.append(text)
|
27 |
+
|
28 |
+
return '\n'.join(extracted_texts)
|
29 |
|
30 |
def clean_response(text: str) -> str:
|
31 |
try:
|
ocr/api/message/views.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
from fastapi import File, UploadFile
|
2 |
|
3 |
from ocr.api.message import ocr_router
|
|
|
4 |
from ocr.api.message.schemas import OcrResponse
|
5 |
-
from ocr.api.message.utils import divide_images,
|
6 |
from ocr.core.wrappers import OcrResponseWrapper
|
7 |
|
8 |
|
@@ -12,6 +13,9 @@ async def get_all_chat_messages(
|
|
12 |
) -> OcrResponseWrapper[OcrResponse]:
|
13 |
try:
|
14 |
contents = await file.read()
|
15 |
-
|
|
|
|
|
|
|
16 |
finally:
|
17 |
await file.close()
|
|
|
1 |
from fastapi import File, UploadFile
|
2 |
|
3 |
from ocr.api.message import ocr_router
|
4 |
+
from ocr.api.message.openai_request import generate_report
|
5 |
from ocr.api.message.schemas import OcrResponse
|
6 |
+
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
|
7 |
from ocr.core.wrappers import OcrResponseWrapper
|
8 |
|
9 |
|
|
|
13 |
) -> OcrResponseWrapper[OcrResponse]:
|
14 |
try:
|
15 |
contents = await file.read()
|
16 |
+
images = divide_images(contents)
|
17 |
+
text_content = extract_text_from_images(images)
|
18 |
+
response = await generate_report(text_content)
|
19 |
+
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response)))
|
20 |
finally:
|
21 |
await file.close()
|
ocr/core/wrappers.py
CHANGED
@@ -1,10 +1,14 @@
|
|
|
|
1 |
from functools import wraps
|
2 |
from typing import Generic, Optional, TypeVar
|
3 |
|
|
|
4 |
from fastapi import HTTPException
|
5 |
from pydantic import BaseModel
|
6 |
from starlette.responses import JSONResponse
|
7 |
|
|
|
|
|
8 |
T = TypeVar('T')
|
9 |
|
10 |
|
@@ -42,6 +46,32 @@ def exception_wrapper(http_error: int, error_message: str):
|
|
42 |
return decorator
|
43 |
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
def background_task():
|
46 |
def decorator(func):
|
47 |
@wraps(func)
|
|
|
1 |
+
import json
|
2 |
from functools import wraps
|
3 |
from typing import Generic, Optional, TypeVar
|
4 |
|
5 |
+
import pydash
|
6 |
from fastapi import HTTPException
|
7 |
from pydantic import BaseModel
|
8 |
from starlette.responses import JSONResponse
|
9 |
|
10 |
+
from ocr.core.config import settings
|
11 |
+
|
12 |
T = TypeVar('T')
|
13 |
|
14 |
|
|
|
46 |
return decorator
|
47 |
|
48 |
|
49 |
+
def openai_wrapper(
|
50 |
+
temperature: int | float = 0, model: str = "gpt-4o-mini", is_json: bool = False, return_: str = None
|
51 |
+
):
|
52 |
+
def decorator(func):
|
53 |
+
@wraps(func)
|
54 |
+
async def wrapper(*args, **kwargs) -> str:
|
55 |
+
messages = await func(*args, **kwargs)
|
56 |
+
completion = await settings.OPENAI_CLIENT.chat.completions.create(
|
57 |
+
messages=messages,
|
58 |
+
temperature=temperature,
|
59 |
+
n=1,
|
60 |
+
model=model,
|
61 |
+
response_format={"type": "json_object"} if is_json else {"type": "text"}
|
62 |
+
)
|
63 |
+
response = completion.choices[0].message.content
|
64 |
+
if is_json:
|
65 |
+
response = json.loads(response)
|
66 |
+
if return_:
|
67 |
+
return pydash.get(response, return_)
|
68 |
+
return response
|
69 |
+
|
70 |
+
return wrapper
|
71 |
+
|
72 |
+
return decorator
|
73 |
+
|
74 |
+
|
75 |
def background_task():
|
76 |
def decorator(func):
|
77 |
@wraps(func)
|
requirements.txt
CHANGED
@@ -11,11 +11,13 @@ httpx==0.28.1
|
|
11 |
idna==3.10
|
12 |
jiter==0.8.2
|
13 |
openai==1.59.9
|
|
|
14 |
pdf2image==1.17.0
|
15 |
pillow==11.1.0
|
16 |
pydantic==2.10.5
|
17 |
pydantic_core==2.27.2
|
18 |
pydash==8.0.5
|
|
|
19 |
python-dotenv==1.0.1
|
20 |
python-multipart==0.0.20
|
21 |
PyYAML==6.0.2
|
|
|
11 |
idna==3.10
|
12 |
jiter==0.8.2
|
13 |
openai==1.59.9
|
14 |
+
packaging==24.2
|
15 |
pdf2image==1.17.0
|
16 |
pillow==11.1.0
|
17 |
pydantic==2.10.5
|
18 |
pydantic_core==2.27.2
|
19 |
pydash==8.0.5
|
20 |
+
pytesseract==0.3.13
|
21 |
python-dotenv==1.0.1
|
22 |
python-multipart==0.0.20
|
23 |
PyYAML==6.0.2
|