KevinHuSh committed
Commit 738c322
1 Parent(s): f4456af

add docker compose (#8)

* add docker compose

docker/docker-compose.yml ADDED
@@ -0,0 +1,68 @@
+version: '2.2'
+services:
+  es01:
+    container_name: docass-es-01
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - esdata01:/usr/share/elasticsearch/data
+    ports:
+      - ${ES_PORT}:9200
+    environment:
+      - node.name=es01
+      - cluster.name=${CLUSTER_NAME}
+      - cluster.initial_master_nodes=es01
+      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
+      - bootstrap.memory_lock=false
+      - xpack.security.enabled=false
+    mem_limit: ${MEM_LIMIT}
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    networks:
+      - docass
+    restart: always
+
+  kibana:
+    depends_on:
+      - es01
+    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
+    container_name: docass-kibana
+    volumes:
+      - kibanadata:/usr/share/kibana/data
+    ports:
+      - ${KIBANA_PORT}:5601
+    environment:
+      - SERVERNAME=kibana
+      - ELASTICSEARCH_HOSTS=http://es01:9200
+    mem_limit: ${MEM_LIMIT}
+    networks:
+      - docass
+
+  postgres:
+    image: postgres
+    container_name: docass-postgres
+    environment:
+      - POSTGRES_USER=${POSTGRES_USER}
+      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
+      - POSTGRES_DB=${POSTGRES_DB}
+    ports:
+      - 5455:5432  # PostgreSQL listens on 5432 inside the container
+    volumes:
+      - pg_data:/var/lib/postgresql/data  # PostgreSQL data directory
+    networks:
+      - docass
+    restart: always
+
+
+volumes:
+  esdata01:
+    driver: local
+  kibanadata:
+    driver: local
+  pg_data:
+    driver: local
+
+networks:
+  docass:
+    driver: bridge
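
The compose file expects STACK_VERSION, ES_PORT, KIBANA_PORT, CLUSTER_NAME, ELASTIC_PASSWORD, MEM_LIMIT, POSTGRES_USER, POSTGRES_PASSWORD and POSTGRES_DB to be supplied, typically via a `.env` file next to `docker-compose.yml`. Below is a minimal, stdlib-only sketch for checking that the published ports answer once the stack is up; the concrete port numbers are assumptions (ES_PORT=9200, KIBANA_PORT=5601, Postgres mapped to 5455), not part of this commit.

```python
import socket

# Assumed host-side ports; adjust to whatever ES_PORT / KIBANA_PORT resolve to in .env.
SERVICES = {"es01": 9200, "kibana": 5601, "postgres": 5455}

for name, port in SERVICES.items():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(2)
        up = s.connect_ex(("127.0.0.1", port)) == 0  # 0 means the TCP connect succeeded
    print(f"{name:8s} port {port}: {'up' if up else 'down'}")
```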
python/README.md ADDED
@@ -0,0 +1,22 @@
+
+```shell
+
+docker pull postgres
+
+LOCAL_POSTGRES_DATA=./postgres-data
+
+docker run \
+  --name docass-postgres \
+  -p 5455:5432 \
+  -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data \
+  -e POSTGRES_USER=root \
+  -e POSTGRES_PASSWORD=infiniflow_docass \
+  -e POSTGRES_DB=docass \
+  -d \
+  postgres
+
+docker network create elastic
+docker pull elasticsearch:8.11.3
+docker pull docker.elastic.co/kibana/kibana:8.11.3
+
+```
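
Once the container above is running, the credentials from the `docker run` command (user root, password infiniflow_docass, database docass, host port 5455) can be verified with a short psycopg2 check; this is a sketch, not part of the commit.

```python
import psycopg2

# Connection details taken from the docker run command above.
conn = psycopg2.connect(
    dbname="docass",
    user="root",
    password="infiniflow_docass",
    host="127.0.0.1",
    port=5455,
)
with conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])  # prints the PostgreSQL server version string
conn.close()
```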
python/conf/sys.cnf CHANGED
@@ -1,4 +1,8 @@
 [online]
 es=127.0.0.1:9200
 idx_nm=toxic
+pgdb_usr=root
+pgdb_pwd=infiniflow_docass
+pgdb_host=127.0.0.1
+pgdb_port=5432
 
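
The new `pgdb_*` keys sit in the same `[online]` section as the existing Elasticsearch settings. A minimal sketch of reading them with the standard-library configparser, assuming `sys.cnf` stays in plain INI form as shown; the repo's own `util.config` helper may wrap this differently.

```python
import configparser

cfg = configparser.ConfigParser()
cfg.read("python/conf/sys.cnf")  # path relative to the repo root; adjust as needed

online = cfg["online"]
print(online["es"])                             # 127.0.0.1:9200
print(online["pgdb_usr"], online["pgdb_host"])  # root 127.0.0.1
print(online.getint("pgdb_port"))               # 5432
```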
python/nlp/huchunk.py CHANGED
@@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
 
 
 class DocxChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
     def __init__(self, doc_parser):
         self.doc = doc_parser
         super().__init__()
@@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
 
 
 class ExcelChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
     def __init__(self, excel_parser):
         self.excel = excel_parser
         super().__init__()
@@ -354,10 +366,10 @@ if __name__ == "__main__":
         from parser import PdfParser
         ckr = PdfChunker(PdfParser())
     if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
-        from .parser import DocxParser
+        from parser import DocxParser
         ckr = DocxChunker(DocxParser())
     if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
-        from .parser import ExcelParser
+        from parser import ExcelParser
         ckr = ExcelChunker(ExcelParser())
 
     # ckr.html(sys.argv[1])
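
The hunks above give DocxChunker and ExcelChunker a small `Fields` dataclass for their chunking results. A self-contained illustration of that container pattern, assuming calling code assigns the lists after chunking; the sample strings are placeholders, not repo output.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Fields:
    # Both default to None; the chunker assigns real lists once parsing is done.
    text_chunks: List = None
    table_chunks: List = None


f = Fields()
f.text_chunks = ["intro paragraph", "second paragraph"]  # placeholder data
f.table_chunks = []
print(f.text_chunks or [])   # ['intro paragraph', 'second paragraph']
print(f.table_chunks or [])  # []
```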
python/parser/pdf_parser.py CHANGED
@@ -323,7 +323,7 @@ class HuParser:
         return layouts
 
     def __table_paddle(self, images):
-        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
+        tbls = self.tbl_det([img for img in images], threshold=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
python/util/db_conn.py ADDED
@@ -0,0 +1,44 @@
+import logging
+import time
+from util import config
+import pandas as pd
+
+
+class Postgre(object):
+    def __init__(self, env, dbnm):
+        self.config = config.init(env)
+        self.conn = None
+        self.dbnm = dbnm
+        self.__open__()
+
+    def __open__(self):
+        import psycopg2
+        try:
+            if self.conn:
+                self.__close__()
+            del self.conn
+        except Exception:
+            pass
+
+        try:
+            self.conn = psycopg2.connect(
+                f"dbname={self.dbnm} user={self.config.get('pgdb_usr')} "
+                f"password={self.config.get('pgdb_pwd')} "
+                f"host={self.config.get('pgdb_host')} port={self.config.get('pgdb_port')}")
+        except Exception as e:
+            logging.error("Failed to connect to %s: " % self.config.get("pgdb_host") + str(e))
+
+    def __close__(self):
+        try:
+            self.conn.close()
+        except Exception as e:
+            logging.error("Failed to close %s: " % self.config.get("pgdb_host") + str(e))
+
+    def select(self, sql):
+        # Retry up to 10 times, reopening the connection after each failure.
+        for _ in range(10):
+            try:
+                return pd.read_sql(sql, self.conn)
+            except Exception as e:
+                logging.error(f"Failed to execute {sql}: " + str(e))
+                self.__open__()
+                time.sleep(1)
+
+        return pd.DataFrame()
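
A usage sketch for the new helper, assuming `util.config.init('online')` resolves to the `[online]` section of `sys.cnf` above and that the docass database from the compose/README setup is reachable; on repeated failures `select` returns an empty DataFrame.

```python
from util.db_conn import Postgre

# 'online' and 'docass' mirror the config section and POSTGRES_DB used elsewhere
# in this commit; treating them as the intended arguments is an assumption.
db = Postgre("online", "docass")

df = db.select("SELECT 1 AS ok;")
print(df)  # one-row DataFrame on success, empty DataFrame after ten failed retries

db.__close__()
```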