Spaces:
Sleeping
Sleeping
add ner
Browse files- ocr/api/message/utils.py +16 -0
- ocr/api/message/views.py +2 -6
- ocr/core/config.py +2 -0
- requirements.txt +65 -0
ocr/api/message/utils.py
CHANGED
@@ -4,8 +4,11 @@ import re
|
|
4 |
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
|
|
7 |
from pdf2image import convert_from_bytes
|
8 |
|
|
|
|
|
9 |
|
10 |
def divide_images(contents: bytes) -> list[bytes]:
|
11 |
images = convert_from_bytes(contents, dpi=250)
|
@@ -49,3 +52,16 @@ def clean_response(text: str) -> str:
|
|
49 |
except Exception as e:
|
50 |
pass
|
51 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
7 |
+
from flair.data import Sentence
|
8 |
from pdf2image import convert_from_bytes
|
9 |
|
10 |
+
from ocr.core.config import settings
|
11 |
+
|
12 |
|
13 |
def divide_images(contents: bytes) -> list[bytes]:
|
14 |
images = convert_from_bytes(contents, dpi=250)
|
|
|
52 |
except Exception as e:
|
53 |
pass
|
54 |
return text
|
55 |
+
|
56 |
+
|
57 |
+
def clean_text(text: str) -> str:
|
58 |
+
sentence = Sentence(text)
|
59 |
+
settings.TAGGER.predict(sentence)
|
60 |
+
per_entities = [entity for entity in sentence.get_spans('ner') if entity.tag == 'PER']
|
61 |
+
per_entities = sorted(per_entities, key=lambda x: x.start_position, reverse=True)
|
62 |
+
cleaned_text = text
|
63 |
+
for entity in per_entities:
|
64 |
+
start = entity.start_position
|
65 |
+
end = entity.end_position
|
66 |
+
cleaned_text = cleaned_text[:start] + cleaned_text[end:]
|
67 |
+
return cleaned_text
|
ocr/api/message/views.py
CHANGED
@@ -2,7 +2,7 @@ from fastapi import File, UploadFile, HTTPException
|
|
2 |
|
3 |
from ocr.api.message import ocr_router
|
4 |
from ocr.api.message.schemas import OcrResponse
|
5 |
-
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
|
6 |
from ocr.core.wrappers import OcrResponseWrapper
|
7 |
|
8 |
|
@@ -21,11 +21,7 @@ async def get_all_chat_messages(
|
|
21 |
else:
|
22 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
23 |
text_content = extract_text_from_images(images)
|
24 |
-
|
25 |
-
# extract_original_text(text_content),
|
26 |
-
# generate_report(text_content)
|
27 |
-
# )
|
28 |
-
cleaned_original_text = text_content
|
29 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
|
30 |
finally:
|
31 |
await file.close()
|
|
|
2 |
|
3 |
from ocr.api.message import ocr_router
|
4 |
from ocr.api.message.schemas import OcrResponse
|
5 |
+
from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images, clean_text
|
6 |
from ocr.core.wrappers import OcrResponseWrapper
|
7 |
|
8 |
|
|
|
21 |
else:
|
22 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
23 |
text_content = extract_text_from_images(images)
|
24 |
+
cleaned_original_text = clean_text(text_content)
|
|
|
|
|
|
|
|
|
25 |
return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
|
26 |
finally:
|
27 |
await file.close()
|
ocr/core/config.py
CHANGED
@@ -3,12 +3,14 @@ import pathlib
|
|
3 |
from functools import lru_cache
|
4 |
|
5 |
from dotenv import load_dotenv
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
|
9 |
class BaseConfig:
|
10 |
BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
|
11 |
SECRET_KEY = os.getenv('SECRET')
|
|
|
12 |
|
13 |
class DevelopmentConfig(BaseConfig):
|
14 |
Issuer = "http://localhost:8000"
|
|
|
3 |
from functools import lru_cache
|
4 |
|
5 |
from dotenv import load_dotenv
|
6 |
+
from flair.models import SequenceTagger
|
7 |
|
8 |
load_dotenv()
|
9 |
|
10 |
class BaseConfig:
    """Base application configuration shared by all environments."""

    # Repository root: three parent hops up from this file
    # (ocr/core/config.py -> ocr/core -> ocr -> project root).
    BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent

    # Application secret read from the SECRET environment variable
    # (load_dotenv() has already run at module import); None when unset.
    SECRET_KEY = os.getenv('SECRET')

    # Shared flair NER model used by ocr.api.message.utils.clean_text.
    # NOTE(review): SequenceTagger.load() runs at class-definition time, so
    # importing this config module downloads/loads a large transformer model
    # eagerly — confirm this startup cost is acceptable for every entry point
    # that imports settings.
    TAGGER = SequenceTagger.load("flair/ner-english-large")
|
14 |
|
15 |
class DevelopmentConfig(BaseConfig):
|
16 |
Issuer = "http://localhost:8000"
|
requirements.txt
CHANGED
@@ -1,31 +1,96 @@
|
|
|
|
1 |
annotated-types==0.7.0
|
2 |
anyio==4.8.0
|
|
|
|
|
|
|
|
|
|
|
3 |
certifi==2024.12.14
|
|
|
4 |
click==8.1.8
|
|
|
|
|
|
|
|
|
5 |
distro==1.9.0
|
|
|
6 |
fastapi==0.115.6
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
h11==0.14.0
|
8 |
httpcore==1.0.7
|
9 |
httptools==0.6.4
|
10 |
httpx==0.28.1
|
|
|
11 |
idna==3.10
|
|
|
|
|
12 |
jiter==0.8.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
openai==1.59.9
|
14 |
packaging==24.2
|
15 |
pdf2image==1.17.0
|
16 |
pillow==11.1.0
|
|
|
|
|
|
|
17 |
pydantic==2.10.5
|
18 |
pydantic_core==2.27.2
|
19 |
pydash==8.0.5
|
|
|
|
|
20 |
pytesseract==0.3.13
|
|
|
21 |
python-dotenv==1.0.1
|
22 |
python-multipart==0.0.20
|
|
|
23 |
PyYAML==6.0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
sniffio==1.3.1
|
|
|
|
|
|
|
25 |
starlette==0.41.3
|
|
|
|
|
|
|
|
|
|
|
26 |
tqdm==4.67.1
|
|
|
|
|
27 |
typing_extensions==4.12.2
|
|
|
28 |
uvicorn==0.34.0
|
29 |
uvloop==0.21.0
|
30 |
watchfiles==1.0.4
|
|
|
31 |
websockets==14.2
|
|
|
|
|
|
1 |
+
accelerate==1.4.0
|
2 |
annotated-types==0.7.0
|
3 |
anyio==4.8.0
|
4 |
+
attrs==25.1.0
|
5 |
+
beautifulsoup4==4.13.3
|
6 |
+
bioc==2.1
|
7 |
+
boto3==1.37.4
|
8 |
+
botocore==1.37.4
|
9 |
certifi==2024.12.14
|
10 |
+
charset-normalizer==3.4.1
|
11 |
click==8.1.8
|
12 |
+
conllu==4.5.3
|
13 |
+
contourpy==1.3.1
|
14 |
+
cycler==0.12.1
|
15 |
+
Deprecated==1.2.18
|
16 |
distro==1.9.0
|
17 |
+
docopt==0.6.2
|
18 |
fastapi==0.115.6
|
19 |
+
filelock==3.17.0
|
20 |
+
flair==0.15.1
|
21 |
+
fonttools==4.56.0
|
22 |
+
fsspec==2025.2.0
|
23 |
+
ftfy==6.3.1
|
24 |
+
gdown==5.2.0
|
25 |
h11==0.14.0
|
26 |
httpcore==1.0.7
|
27 |
httptools==0.6.4
|
28 |
httpx==0.28.1
|
29 |
+
huggingface-hub==0.29.1
|
30 |
idna==3.10
|
31 |
+
intervaltree==3.1.0
|
32 |
+
Jinja2==3.1.5
|
33 |
jiter==0.8.2
|
34 |
+
jmespath==1.0.1
|
35 |
+
joblib==1.4.2
|
36 |
+
jsonlines==4.0.0
|
37 |
+
kiwisolver==1.4.8
|
38 |
+
langdetect==1.0.9
|
39 |
+
lxml==5.3.1
|
40 |
+
MarkupSafe==3.0.2
|
41 |
+
matplotlib==3.10.1
|
42 |
+
more-itertools==10.6.0
|
43 |
+
mpld3==0.5.10
|
44 |
+
mpmath==1.3.0
|
45 |
+
networkx==3.4.2
|
46 |
+
numpy==1.26.4
|
47 |
openai==1.59.9
|
48 |
packaging==24.2
|
49 |
pdf2image==1.17.0
|
50 |
pillow==11.1.0
|
51 |
+
pptree==3.1
|
52 |
+
protobuf==5.29.3
|
53 |
+
psutil==7.0.0
|
54 |
pydantic==2.10.5
|
55 |
pydantic_core==2.27.2
|
56 |
pydash==8.0.5
|
57 |
+
pyparsing==3.2.1
|
58 |
+
PySocks==1.7.1
|
59 |
pytesseract==0.3.13
|
60 |
+
python-dateutil==2.9.0.post0
|
61 |
python-dotenv==1.0.1
|
62 |
python-multipart==0.0.20
|
63 |
+
pytorch_revgrad==0.2.0
|
64 |
PyYAML==6.0.2
|
65 |
+
regex==2024.11.6
|
66 |
+
requests==2.32.3
|
67 |
+
s3transfer==0.11.3
|
68 |
+
safetensors==0.5.3
|
69 |
+
scikit-learn==1.6.1
|
70 |
+
scipy==1.15.2
|
71 |
+
segtok==1.5.11
|
72 |
+
sentencepiece==0.2.0
|
73 |
+
setuptools==75.8.2
|
74 |
+
six==1.17.0
|
75 |
sniffio==1.3.1
|
76 |
+
sortedcontainers==2.4.0
|
77 |
+
soupsieve==2.6
|
78 |
+
sqlitedict==2.1.0
|
79 |
starlette==0.41.3
|
80 |
+
sympy==1.13.1
|
81 |
+
tabulate==0.9.0
|
82 |
+
threadpoolctl==3.5.0
|
83 |
+
tokenizers==0.21.0
|
84 |
+
torch==2.6.0
|
85 |
tqdm==4.67.1
|
86 |
+
transformer-smaller-training-vocab==0.4.0
|
87 |
+
transformers==4.49.0
|
88 |
typing_extensions==4.12.2
|
89 |
+
urllib3==2.3.0
|
90 |
uvicorn==0.34.0
|
91 |
uvloop==0.21.0
|
92 |
watchfiles==1.0.4
|
93 |
+
wcwidth==0.2.13
|
94 |
websockets==14.2
|
95 |
+
Wikipedia-API==0.8.1
|
96 |
+
wrapt==1.17.2
|