KevinHuSh commited on
Commit
3cefaa0
·
1 Parent(s): b0577d6

Reduce docker memory usage (#501)

Browse files

### What problem does this PR solve?

Reduces the resource footprint of the docker deployment and improves observability:
lowers the Elasticsearch memory limit (`MEM_LIMIT` 12073741824 → 8073741824),
disables the Kibana service in `docker-compose-base.yml`, reduces the task-executor
worker count (`WS` 2 → 1), initializes the database tables/data from the task broker,
and replaces the "paddle layouts" log messages with generic "layouts" timing logs
plus additional elapsed-time logging for OCR, MinIO puts, chunk building, embedding,
and indexing.
### Type of change

- [x] Refactoring

deepdoc/parser/pdf_parser.py CHANGED
@@ -11,7 +11,7 @@ import pdfplumber
11
  import logging
12
  from PIL import Image, ImageDraw
13
  import numpy as np
14
-
15
  from PyPDF2 import PdfReader as pdf2_read
16
 
17
  from api.utils.file_utils import get_project_base_directory
@@ -936,6 +936,7 @@ class HuParser:
936
  self.page_cum_height = [0]
937
  self.page_layout = []
938
  self.page_from = page_from
 
939
  try:
940
  self.pdf = pdfplumber.open(fnm) if isinstance(
941
  fnm, str) else pdfplumber.open(BytesIO(fnm))
@@ -989,7 +990,9 @@ class HuParser:
989
  self.is_english = True
990
  else:
991
  self.is_english = False
 
992
 
 
993
  for i, img in enumerate(self.page_images):
994
  chars = self.page_chars[i] if not self.is_english else []
995
  self.mean_height.append(
@@ -1007,15 +1010,11 @@ class HuParser:
1007
  chars[j]["width"]) / 2:
1008
  chars[j]["text"] += " "
1009
  j += 1
1010
- # if i > 0:
1011
- # if not chars:
1012
- # self.page_cum_height.append(img.size[1] / zoomin)
1013
- # else:
1014
- # self.page_cum_height.append(
1015
- # np.max([c["bottom"] for c in chars]))
1016
  self.__ocr(i + 1, img, chars, zoomin)
1017
- if callback:
1018
- callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
 
1019
 
1020
  if not self.is_english and not any(
1021
  [c for c in self.page_chars]) and self.boxes:
 
11
  import logging
12
  from PIL import Image, ImageDraw
13
  import numpy as np
14
+ from timeit import default_timer as timer
15
  from PyPDF2 import PdfReader as pdf2_read
16
 
17
  from api.utils.file_utils import get_project_base_directory
 
936
  self.page_cum_height = [0]
937
  self.page_layout = []
938
  self.page_from = page_from
939
+ st = timer()
940
  try:
941
  self.pdf = pdfplumber.open(fnm) if isinstance(
942
  fnm, str) else pdfplumber.open(BytesIO(fnm))
 
990
  self.is_english = True
991
  else:
992
  self.is_english = False
993
+ self.is_english = False
994
 
995
+ st = timer()
996
  for i, img in enumerate(self.page_images):
997
  chars = self.page_chars[i] if not self.is_english else []
998
  self.mean_height.append(
 
1010
  chars[j]["width"]) / 2:
1011
  chars[j]["text"] += " "
1012
  j += 1
1013
+
 
 
 
 
 
1014
  self.__ocr(i + 1, img, chars, zoomin)
1015
+ #if callback:
1016
+ # callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
1017
+ #print("OCR:", timer()-st)
1018
 
1019
  if not self.is_english and not any(
1020
  [c for c in self.page_chars]) and self.boxes:
docker/.env CHANGED
@@ -11,7 +11,9 @@ ES_PORT=1200
11
  KIBANA_PORT=6601
12
 
13
  # Increase or decrease based on the available host memory (in bytes)
14
- MEM_LIMIT=12073741824
 
 
15
 
16
  MYSQL_PASSWORD=infini_rag_flow
17
  MYSQL_PORT=5455
 
11
  KIBANA_PORT=6601
12
 
13
  # Increase or decrease based on the available host memory (in bytes)
14
+
15
+ MEM_LIMIT=8073741824
16
+
17
 
18
  MYSQL_PASSWORD=infini_rag_flow
19
  MYSQL_PORT=5455
docker/docker-compose-base.yml CHANGED
@@ -29,23 +29,23 @@ services:
29
  - ragflow
30
  restart: always
31
 
32
- kibana:
33
- depends_on:
34
- es01:
35
- condition: service_healthy
36
- image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
37
- container_name: ragflow-kibana
38
- volumes:
39
- - kibanadata:/usr/share/kibana/data
40
- ports:
41
- - ${KIBANA_PORT}:5601
42
- environment:
43
- - SERVERNAME=kibana
44
- - ELASTICSEARCH_HOSTS=http://es01:9200
45
- - TZ=${TIMEZONE}
46
- mem_limit: ${MEM_LIMIT}
47
- networks:
48
- - ragflow
49
 
50
  mysql:
51
  image: mysql:5.7.18
 
29
  - ragflow
30
  restart: always
31
 
32
+ #kibana:
33
+ # depends_on:
34
+ # es01:
35
+ # condition: service_healthy
36
+ # image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
37
+ # container_name: ragflow-kibana
38
+ # volumes:
39
+ # - kibanadata:/usr/share/kibana/data
40
+ # ports:
41
+ # - ${KIBANA_PORT}:5601
42
+ # environment:
43
+ # - SERVERNAME=kibana
44
+ # - ELASTICSEARCH_HOSTS=http://es01:9200
45
+ # - TZ=${TIMEZONE}
46
+ # mem_limit: ${MEM_LIMIT}
47
+ # networks:
48
+ # - ragflow
49
 
50
  mysql:
51
  image: mysql:5.7.18
docker/entrypoint.sh CHANGED
@@ -29,7 +29,7 @@ function task_bro(){
29
 
30
  task_bro &
31
 
32
- WS=2
33
  for ((i=0;i<WS;i++))
34
  do
35
  task_exe $i $WS &
 
29
 
30
  task_bro &
31
 
32
+ WS=1
33
  for ((i=0;i<WS;i++))
34
  do
35
  task_exe $i $WS &
rag/app/book.py CHANGED
@@ -37,7 +37,7 @@ class Pdf(PdfParser):
37
  start = timer()
38
  self._layouts_rec(zoomin)
39
  callback(0.67, "Layout analysis finished")
40
- print("paddle layouts:", timer() - start)
41
  self._table_transformer_job(zoomin)
42
  callback(0.68, "Table analysis finished")
43
  self._text_merge()
 
37
  start = timer()
38
  self._layouts_rec(zoomin)
39
  callback(0.67, "Layout analysis finished")
40
+ print("layouts:", timer() - start)
41
  self._table_transformer_job(zoomin)
42
  callback(0.68, "Table analysis finished")
43
  self._text_merge()
rag/app/laws.py CHANGED
@@ -71,7 +71,7 @@ class Pdf(PdfParser):
71
  start = timer()
72
  self._layouts_rec(zoomin)
73
  callback(0.67, "Layout analysis finished")
74
- cron_logger.info("paddle layouts:".format(
75
  (timer() - start) / (self.total_page + 0.1)))
76
  self._naive_vertical_merge()
77
 
 
71
  start = timer()
72
  self._layouts_rec(zoomin)
73
  callback(0.67, "Layout analysis finished")
74
+ cron_logger.info("layouts:".format(
75
  (timer() - start) / (self.total_page + 0.1)))
76
  self._naive_vertical_merge()
77
 
rag/app/manual.py CHANGED
@@ -32,7 +32,7 @@ class Pdf(PdfParser):
32
 
33
  self._layouts_rec(zoomin)
34
  callback(0.65, "Layout analysis finished.")
35
- print("paddle layouts:", timer() - start)
36
  self._table_transformer_job(zoomin)
37
  callback(0.67, "Table analysis finished.")
38
  self._text_merge()
 
32
 
33
  self._layouts_rec(zoomin)
34
  callback(0.65, "Layout analysis finished.")
35
+ print("layouts:", timer() - start)
36
  self._table_transformer_job(zoomin)
37
  callback(0.67, "Table analysis finished.")
38
  self._text_merge()
rag/app/naive.py CHANGED
@@ -77,12 +77,12 @@ class Pdf(PdfParser):
77
  callback
78
  )
79
  callback(msg="OCR finished")
80
- cron_logger.info("OCR: {}".format(timer() - start))
81
 
82
  start = timer()
83
  self._layouts_rec(zoomin)
84
  callback(0.63, "Layout analysis finished.")
85
- print("paddle layouts:", timer() - start)
86
  self._table_transformer_job(zoomin)
87
  callback(0.65, "Table analysis finished.")
88
  self._text_merge()
@@ -92,7 +92,7 @@ class Pdf(PdfParser):
92
  self._concat_downward()
93
  #self._filter_forpages()
94
 
95
- cron_logger.info("paddle layouts: {}".format(
96
  (timer() - start) / (self.total_page + 0.1)))
97
  return [(b["text"], self._line_tag(b, zoomin))
98
  for b in self.boxes], tbls
 
77
  callback
78
  )
79
  callback(msg="OCR finished")
80
+ cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
81
 
82
  start = timer()
83
  self._layouts_rec(zoomin)
84
  callback(0.63, "Layout analysis finished.")
85
+ print("layouts:", timer() - start)
86
  self._table_transformer_job(zoomin)
87
  callback(0.65, "Table analysis finished.")
88
  self._text_merge()
 
92
  self._concat_downward()
93
  #self._filter_forpages()
94
 
95
+ cron_logger.info("layouts: {}".format(
96
  (timer() - start) / (self.total_page + 0.1)))
97
  return [(b["text"], self._line_tag(b, zoomin))
98
  for b in self.boxes], tbls
rag/app/one.py CHANGED
@@ -33,7 +33,7 @@ class Pdf(PdfParser):
33
  start = timer()
34
  self._layouts_rec(zoomin, drop=False)
35
  callback(0.63, "Layout analysis finished.")
36
- print("paddle layouts:", timer() - start)
37
  self._table_transformer_job(zoomin)
38
  callback(0.65, "Table analysis finished.")
39
  self._text_merge()
 
33
  start = timer()
34
  self._layouts_rec(zoomin, drop=False)
35
  callback(0.63, "Layout analysis finished.")
36
+ print("layouts:", timer() - start)
37
  self._table_transformer_job(zoomin)
38
  callback(0.65, "Table analysis finished.")
39
  self._text_merge()
rag/app/paper.py CHANGED
@@ -42,7 +42,7 @@ class Pdf(PdfParser):
42
  start = timer()
43
  self._layouts_rec(zoomin)
44
  callback(0.63, "Layout analysis finished")
45
- print("paddle layouts:", timer() - start)
46
  self._table_transformer_job(zoomin)
47
  callback(0.68, "Table analysis finished")
48
  self._text_merge()
 
42
  start = timer()
43
  self._layouts_rec(zoomin)
44
  callback(0.63, "Layout analysis finished")
45
+ print("layouts:", timer() - start)
46
  self._table_transformer_job(zoomin)
47
  callback(0.68, "Table analysis finished")
48
  self._text_merge()
rag/svr/task_broker.py CHANGED
@@ -33,6 +33,8 @@ from api.settings import database_logger
33
  from api.utils import get_format_time, get_uuid
34
  from api.utils.file_utils import get_project_base_directory
35
  from rag.utils.redis_conn import REDIS_CONN
 
 
36
 
37
 
38
  def collect(tm):
@@ -181,6 +183,9 @@ if __name__ == "__main__":
181
  peewee_logger.propagate = False
182
  peewee_logger.addHandler(database_logger.handlers[0])
183
  peewee_logger.setLevel(database_logger.level)
 
 
 
184
 
185
  while True:
186
  dispatch()
 
33
  from api.utils import get_format_time, get_uuid
34
  from api.utils.file_utils import get_project_base_directory
35
  from rag.utils.redis_conn import REDIS_CONN
36
+ from api.db.db_models import init_database_tables as init_web_db
37
+ from api.db.init_data import init_web_data
38
 
39
 
40
  def collect(tm):
 
183
  peewee_logger.propagate = False
184
  peewee_logger.addHandler(database_logger.handlers[0])
185
  peewee_logger.setLevel(database_logger.level)
186
+ # init db
187
+ init_web_db()
188
+ init_web_data()
189
 
190
  while True:
191
  dispatch()
rag/svr/task_executor.py CHANGED
@@ -163,6 +163,7 @@ def build(row):
163
  "doc_id": row["doc_id"],
164
  "kb_id": [str(row["kb_id"])]
165
  }
 
166
  for ck in cks:
167
  d = copy.deepcopy(doc)
168
  d.update(ck)
@@ -182,10 +183,13 @@ def build(row):
182
  else:
183
  d["image"].save(output_buffer, format='JPEG')
184
 
 
185
  MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
 
186
  d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
187
  del d["image"]
188
  docs.append(d)
 
189
 
190
  return docs
191
 
@@ -258,7 +262,9 @@ def main(comm, mod):
258
  callback(prog=-1, msg=str(e))
259
  continue
260
 
 
261
  cks = build(r)
 
262
  if cks is None:
263
  continue
264
  if not cks:
@@ -277,12 +283,14 @@ def main(comm, mod):
277
  callback(-1, "Embedding error:{}".format(str(e)))
278
  cron_logger.error(str(e))
279
  tk_count = 0
 
280
 
281
  callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
282
  init_kb(r)
283
  chunk_count = len(set([c["_id"] for c in cks]))
284
  st = timer()
285
  es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
 
286
  if es_r:
287
  callback(-1, "Index failure!")
288
  ELASTICSEARCH.deleteByQuery(
 
163
  "doc_id": row["doc_id"],
164
  "kb_id": [str(row["kb_id"])]
165
  }
166
+ el = 0
167
  for ck in cks:
168
  d = copy.deepcopy(doc)
169
  d.update(ck)
 
183
  else:
184
  d["image"].save(output_buffer, format='JPEG')
185
 
186
+ st = timer()
187
  MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
188
+ el += timer() - st
189
  d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
190
  del d["image"]
191
  docs.append(d)
192
+ cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
193
 
194
  return docs
195
 
 
262
  callback(prog=-1, msg=str(e))
263
  continue
264
 
265
+ st = timer()
266
  cks = build(r)
267
+ cron_logger.info("Build chunks({}): {}".format(r["name"], timer()-st))
268
  if cks is None:
269
  continue
270
  if not cks:
 
283
  callback(-1, "Embedding error:{}".format(str(e)))
284
  cron_logger.error(str(e))
285
  tk_count = 0
286
+ cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer()-st))
287
 
288
  callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
289
  init_kb(r)
290
  chunk_count = len(set([c["_id"] for c in cks]))
291
  st = timer()
292
  es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
293
+ cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer()-st))
294
  if es_r:
295
  callback(-1, "Index failure!")
296
  ELASTICSEARCH.deleteByQuery(