brestok committed on
Commit
150c3f8
·
1 Parent(s): 66dc64c
ocr/api/message/utils.py CHANGED
@@ -4,8 +4,11 @@ import re
4
 
5
  import pytesseract
6
  from PIL import Image
 
7
  from pdf2image import convert_from_bytes
8
 
 
 
9
 
10
  def divide_images(contents: bytes) -> list[bytes]:
11
  images = convert_from_bytes(contents, dpi=250)
@@ -49,3 +52,16 @@ def clean_response(text: str) -> str:
49
  except Exception as e:
50
  pass
51
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  import pytesseract
6
  from PIL import Image
7
+ from flair.data import Sentence
8
  from pdf2image import convert_from_bytes
9
 
10
+ from ocr.core.config import settings
11
+
12
 
13
  def divide_images(contents: bytes) -> list[bytes]:
14
  images = convert_from_bytes(contents, dpi=250)
 
52
  except Exception as e:
53
  pass
54
  return text
55
+
56
+
57
+ def clean_text(text: str) -> str:
58
+ sentence = Sentence(text)
59
+ settings.TAGGER.predict(sentence)
60
+ per_entities = [entity for entity in sentence.get_spans('ner') if entity.tag == 'PER']
61
+ per_entities = sorted(per_entities, key=lambda x: x.start_position, reverse=True)
62
+ cleaned_text = text
63
+ for entity in per_entities:
64
+ start = entity.start_position
65
+ end = entity.end_position
66
+ cleaned_text = cleaned_text[:start] + cleaned_text[end:]
67
+ return cleaned_text
ocr/api/message/views.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import File, UploadFile, HTTPException
2
 
3
  from ocr.api.message import ocr_router
4
  from ocr.api.message.schemas import OcrResponse
5
- from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
6
  from ocr.core.wrappers import OcrResponseWrapper
7
 
8
 
@@ -21,11 +21,7 @@ async def get_all_chat_messages(
21
  else:
22
  raise HTTPException(status_code=400, detail='Unsupported file type.')
23
  text_content = extract_text_from_images(images)
24
- # original_text, response = await asyncio.gather(
25
- # extract_original_text(text_content),
26
- # generate_report(text_content)
27
- # )
28
- cleaned_original_text = text_content
29
  return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
30
  finally:
31
  await file.close()
 
2
 
3
  from ocr.api.message import ocr_router
4
  from ocr.api.message.schemas import OcrResponse
5
+ from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images, clean_text
6
  from ocr.core.wrappers import OcrResponseWrapper
7
 
8
 
 
21
  else:
22
  raise HTTPException(status_code=400, detail='Unsupported file type.')
23
  text_content = extract_text_from_images(images)
24
+ cleaned_original_text = clean_text(text_content)
 
 
 
 
25
  return OcrResponseWrapper(data=OcrResponse(text=clean_response(text_content), originalText=cleaned_original_text))
26
  finally:
27
  await file.close()
ocr/core/config.py CHANGED
@@ -3,12 +3,14 @@ import pathlib
3
  from functools import lru_cache
4
 
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
 
9
  class BaseConfig:
10
  BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
11
  SECRET_KEY = os.getenv('SECRET')
 
12
 
13
  class DevelopmentConfig(BaseConfig):
14
  Issuer = "http://localhost:8000"
 
3
  from functools import lru_cache
4
 
5
  from dotenv import load_dotenv
6
+ from flair.models import SequenceTagger
7
 
8
  load_dotenv()
9
 
10
  class BaseConfig:
11
  BASE_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent
12
  SECRET_KEY = os.getenv('SECRET')
13
+ TAGGER = SequenceTagger.load("flair/ner-english-large")
14
 
15
  class DevelopmentConfig(BaseConfig):
16
  Issuer = "http://localhost:8000"
requirements.txt CHANGED
@@ -1,31 +1,96 @@
 
1
  annotated-types==0.7.0
2
  anyio==4.8.0
 
 
 
 
 
3
  certifi==2024.12.14
 
4
  click==8.1.8
 
 
 
 
5
  distro==1.9.0
 
6
  fastapi==0.115.6
 
 
 
 
 
 
7
  h11==0.14.0
8
  httpcore==1.0.7
9
  httptools==0.6.4
10
  httpx==0.28.1
 
11
  idna==3.10
 
 
12
  jiter==0.8.2
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  openai==1.59.9
14
  packaging==24.2
15
  pdf2image==1.17.0
16
  pillow==11.1.0
 
 
 
17
  pydantic==2.10.5
18
  pydantic_core==2.27.2
19
  pydash==8.0.5
 
 
20
  pytesseract==0.3.13
 
21
  python-dotenv==1.0.1
22
  python-multipart==0.0.20
 
23
  PyYAML==6.0.2
 
 
 
 
 
 
 
 
 
 
24
  sniffio==1.3.1
 
 
 
25
  starlette==0.41.3
 
 
 
 
 
26
  tqdm==4.67.1
 
 
27
  typing_extensions==4.12.2
 
28
  uvicorn==0.34.0
29
  uvloop==0.21.0
30
  watchfiles==1.0.4
 
31
  websockets==14.2
 
 
 
1
+ accelerate==1.4.0
2
  annotated-types==0.7.0
3
  anyio==4.8.0
4
+ attrs==25.1.0
5
+ beautifulsoup4==4.13.3
6
+ bioc==2.1
7
+ boto3==1.37.4
8
+ botocore==1.37.4
9
  certifi==2024.12.14
10
+ charset-normalizer==3.4.1
11
  click==8.1.8
12
+ conllu==4.5.3
13
+ contourpy==1.3.1
14
+ cycler==0.12.1
15
+ Deprecated==1.2.18
16
  distro==1.9.0
17
+ docopt==0.6.2
18
  fastapi==0.115.6
19
+ filelock==3.17.0
20
+ flair==0.15.1
21
+ fonttools==4.56.0
22
+ fsspec==2025.2.0
23
+ ftfy==6.3.1
24
+ gdown==5.2.0
25
  h11==0.14.0
26
  httpcore==1.0.7
27
  httptools==0.6.4
28
  httpx==0.28.1
29
+ huggingface-hub==0.29.1
30
  idna==3.10
31
+ intervaltree==3.1.0
32
+ Jinja2==3.1.5
33
  jiter==0.8.2
34
+ jmespath==1.0.1
35
+ joblib==1.4.2
36
+ jsonlines==4.0.0
37
+ kiwisolver==1.4.8
38
+ langdetect==1.0.9
39
+ lxml==5.3.1
40
+ MarkupSafe==3.0.2
41
+ matplotlib==3.10.1
42
+ more-itertools==10.6.0
43
+ mpld3==0.5.10
44
+ mpmath==1.3.0
45
+ networkx==3.4.2
46
+ numpy==1.26.4
47
  openai==1.59.9
48
  packaging==24.2
49
  pdf2image==1.17.0
50
  pillow==11.1.0
51
+ pptree==3.1
52
+ protobuf==5.29.3
53
+ psutil==7.0.0
54
  pydantic==2.10.5
55
  pydantic_core==2.27.2
56
  pydash==8.0.5
57
+ pyparsing==3.2.1
58
+ PySocks==1.7.1
59
  pytesseract==0.3.13
60
+ python-dateutil==2.9.0.post0
61
  python-dotenv==1.0.1
62
  python-multipart==0.0.20
63
+ pytorch_revgrad==0.2.0
64
  PyYAML==6.0.2
65
+ regex==2024.11.6
66
+ requests==2.32.3
67
+ s3transfer==0.11.3
68
+ safetensors==0.5.3
69
+ scikit-learn==1.6.1
70
+ scipy==1.15.2
71
+ segtok==1.5.11
72
+ sentencepiece==0.2.0
73
+ setuptools==75.8.2
74
+ six==1.17.0
75
  sniffio==1.3.1
76
+ sortedcontainers==2.4.0
77
+ soupsieve==2.6
78
+ sqlitedict==2.1.0
79
  starlette==0.41.3
80
+ sympy==1.13.1
81
+ tabulate==0.9.0
82
+ threadpoolctl==3.5.0
83
+ tokenizers==0.21.0
84
+ torch==2.6.0
85
  tqdm==4.67.1
86
+ transformer-smaller-training-vocab==0.4.0
87
+ transformers==4.49.0
88
  typing_extensions==4.12.2
89
+ urllib3==2.3.0
90
  uvicorn==0.34.0
91
  uvloop==0.21.0
92
  watchfiles==1.0.4
93
+ wcwidth==0.2.13
94
  websockets==14.2
95
+ Wikipedia-API==0.8.1
96
+ wrapt==1.17.2