KevinHuSh committed · Commit 3079197 · Parent(s): db8cae3

build python version rag-flow (#21)

* clean rust version project
* build python version rag-flow

This view is limited to 50 files because it contains too many changes.
- .env.template +0 -9
- Cargo.toml +0 -42
- {python/conf → conf}/mapping.json +0 -0
- conf/private.pem +30 -0
- conf/public.pem +9 -0
- conf/service_conf.yaml +28 -0
- docker/.env +0 -21
- docker/docker-compose.yml +30 -17
- docker/init.sql +2 -0
- migration/Cargo.toml +0 -20
- migration/README.md +0 -41
- migration/src/lib.rs +0 -12
- migration/src/m20220101_000001_create_table.rs +0 -440
- migration/src/main.rs +0 -6
- python/Dockerfile +29 -0
- python/README.md +0 -22
- python/{nlp/__init__.py → ToPDF.pdf} +0 -0
- python/] +63 -0
- python/conf/logging.json +0 -41
- python/conf/sys.cnf +0 -9
- python/llm/__init__.py +0 -21
- python/output/ToPDF.pdf +0 -0
- python/requirements.txt +0 -194
- python/res/1-0.tm +8 -0
- python/res/thumbnail-1-0.tm +3 -0
- python/svr/add_thumbnail2file.py +0 -118
- python/svr/dialog_svr.py +0 -165
- python/svr/parse_user_docs.py +0 -258
- python/tmp.log +15 -0
- python/util/config.py +0 -31
- python/util/db_conn.py +0 -70
- python/util/setup_logging.py +0 -36
- rag/__init__.py +0 -0
- rag/llm/__init__.py +32 -0
- {python → rag}/llm/chat_model.py +15 -0
- {python → rag}/llm/cv_model.py +28 -5
- {python → rag}/llm/embedding_model.py +44 -11
- rag/nlp/__init__.py +0 -0
- {python → rag}/nlp/huchunk.py +0 -0
- {python → rag}/nlp/huqie.py +3 -8
- {python → rag}/nlp/query.py +3 -3
- {python → rag}/nlp/search.py +3 -5
- {python → rag}/nlp/synonym.py +7 -10
- {python → rag}/nlp/term_weight.py +6 -6
- {python → rag}/parser/__init__.py +0 -0
- {python → rag}/parser/docx_parser.py +2 -1
- {python → rag}/parser/excel_parser.py +13 -5
- {python → rag}/parser/pdf_parser.py +4 -3
- {python → rag}/res/huqie.txt +0 -0
- {python → rag}/res/ner.json +0 -0
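
Taken together, the file list shows the shape of this change: the Rust stack (Cargo.toml and the actix/sea-orm migration crate) is deleted, the `python/` tree is reorganized under `rag/`, Postgres gives way to MySQL in the Docker setup, and service configuration moves into `conf/service_conf.yaml`.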
.env.template
DELETED
@@ -1,9 +0,0 @@
-# Database
-HOST=127.0.0.1
-PORT=8000
-DATABASE_URL="postgresql://infiniflow:infiniflow@localhost/docgpt"
-
-# S3 Storage
-MINIO_HOST="127.0.0.1:9000"
-MINIO_USR="infiniflow"
-MINIO_PWD="infiniflow_docgpt"
Cargo.toml
DELETED
@@ -1,42 +0,0 @@
-[package]
-name = "doc_gpt"
-version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-actix-web = "4.3.1"
-actix-rt = "2.8.0"
-actix-files = "0.6.2"
-actix-multipart = "0.4"
-actix-session = { version = "0.5" }
-actix-identity = { version = "0.4" }
-actix-web-httpauth = { version = "0.6" }
-actix-ws = "0.2.5"
-uuid = { version = "1.6.1", features = [
-    "v4",
-    "fast-rng",
-    "macro-diagnostics",
-] }
-thiserror = "1.0"
-postgres = "0.19.7"
-sea-orm = { version = "0.12.9", features = ["sqlx-postgres", "runtime-tokio-native-tls", "macros"] }
-serde = { version = "1", features = ["derive"] }
-serde_json = "1.0"
-tracing-subscriber = "0.3.18"
-dotenvy = "0.15.7"
-listenfd = "1.0.1"
-chrono = "0.4.31"
-migration = { path = "./migration" }
-minio = "0.1.0"
-futures-util = "0.3.29"
-actix-multipart-extract = "0.1.5"
-regex = "1.10.2"
-tokio = { version = "1.35.1", features = ["rt", "time", "macros"] }
-
-[[bin]]
-name = "doc_gpt"
-
-[workspace]
-members = [".", "migration"]
{python/conf → conf}/mapping.json
RENAMED
File without changes
conf/private.pem
ADDED
@@ -0,0 +1,30 @@
+-----BEGIN RSA PRIVATE KEY-----
+Proc-Type: 4,ENCRYPTED
+DEK-Info: DES-EDE3-CBC,EFF8327C41E531AD
+
+7jdPFDAA6fiTzOIU7XGzKuT324JKZEcK5vBRJqBkA5XO6ENN1wLdhh3zQbl1Ejfv
+KMSUIgbtQEJB4bvOzS//okbZa1vCNYuTS/NGcpKUnhqdOmAL3hl/kOtOLLjTZrwo
+3KX8iujLH7wQ64GxArtpUuaFq1k0whN1BB5RGJp3IO/L6pMpSWVRKO+JPUrD1Ujr
+XA/LUKQJaZtXVUVOYPtIwbyqPsh93QBetJnRwwV3gNOwGpcX2jDpyTxDUkLJCPPg
+6Hw0pwlQEd8A11sjxCBbASwLeJO1L0w69QiX9chyOkZ+sfDsVpPt/wf1NexA7Cdj
+9uifJ4JGbby39QD6mInZGtnRzQRdafjuXlBR2I0Qa7fBRu8QsfhmLbWZfWno7j08
+4bAAoqB1vRNfSu8LVJXdEEh/HKuwu11pgRr5eH8WQ3hJg+Y2k7zDHpp1VaHL7/Kn
+S+aN5bhQ4Xt0Ujdi1+rsmNchnF6LWsDezHWJeWUM6X7dJnqIBl8oCyghbghT8Tyw
+aEKWXc2+7FsP5yd0NfG3PFYOLdLgfI43pHTAv5PEQ47w9r1XOwfblKKBUDEzaput
+T3t5wQ6wxdyhRxeO4arCHfe/i+j3fzvhlwgbuwrmrkWGWSS86eMTaoGM8+uUrHv0
+6TbU0tj6DKKUslVk1dCHh9TnmNsXZuLJkceZF38PSKNxhzudU8OTtzhS0tFL91HX
+vo7N+XdiGMs8oOSpjE6RPlhFhVAKGJpXwBj/vXLLcmzesA7ZB2kYtFKMIdsUQpls
+PE/4K5PEX2d8pxA5zxo0HleA1YjW8i5WEcDQThZQzj2sWvg06zSjenVFrbCm9Bro
+hFpAB/3zJHxdRN2MpNpvK35WITy1aDUdX1WdyrlcRtIE5ssFTSoxSj9ibbDZ78+z
+gtbw/MUi6vU6Yz1EjvoYu/bmZAHt9Aagcxw6k58fjO2cEB9njK7xbbiZUSwpJhEe
+U/PxK+SdOU/MmGKeqdgqSfhJkq0vhacvsEjFGRAfivSCHkL0UjhObU+rSJ3g1RMO
+oukAev6TOAwbTKVWjg3/EX+pl/zorAgaPNYFX64TSH4lE3VjeWApITb9Z5C/sVxR
+xW6hU9qyjzWYWY+91y16nkw1l7VQvWHUZwV7QzTScC2BOzDVpeqY1KiYJxgoo6sX
+ZCqR5oh4vToG4W8ZrRyauwUaZJ3r+zhAgm+6n6TJQNwFEl0muji+1nPl32EiFsRs
+qR6CtuhUOVQM4VnILDwFJfuGYRFtKzQgvseLNU4ZqAVqQj8l4ARGAP2P1Au/uUKy
+oGzI7a+b5MvRHuvkxPAclOgXgX/8yyOLaBg+mgaqv9h2JIJD28PzouFl3BajRaVB
+7GWTnROJYhX5SuX/g585SLRKoQUtK0WhdJCjTRfyRJPwfdppgdTbWO99R4G+ir02
+JQdSkZf2vmZRXenPNTEPDOUY6nVN6sUuBjmtOwoUF194ODgpYB6IaHqK08sa1pUh
+1mZyxitHdPbygePTe20XWMZFoK2knAqN0JPPbbNjCqiVV+7oqQAnkDIutspu9t2m
+ny3jefFmNozbblQMghLUrq+x9wOEgvS76Sqvq3DG/2BkLzJF3MNkvw==
+-----END RSA PRIVATE KEY-----
conf/public.pem
ADDED
@@ -0,0 +1,9 @@
+-----BEGIN PUBLIC KEY-----
+MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArq9XTUSeYr2+N1h3Afl/
+z8Dse/2yD0ZGrKwx+EEEcdsBLca9Ynmx3nIB5obmLlSfmskLpBo0UACBmB5rEjBp
+2Q2f3AG3Hjd4B+gNCG6BDaawuDlgANIhGnaTLrIqWrrcm4EMzJOnAOI1fgzJRsOO
+UEfaS318Eq9OVO3apEyCCt0lOQK6PuksduOjVxtltDav+guVAA068NrPYmRNabVK
+RNLJpL8w4D44sfth5RvZ3q9t+6RTArpEtc5sh5ChzvqPOzKGMXW83C95TxmXqpbK
+6olN4RevSfVjEAgCydH6HN6OhtOQEcnrU97r9H0iZOWwbw3pVrZiUkuRD1R56Wzs
+2wIDAQAB
+-----END PUBLIC KEY-----
conf/service_conf.yaml
ADDED
@@ -0,0 +1,28 @@
+authentication:
+  client:
+    switch: false
+    http_app_key:
+    http_secret_key:
+  site:
+    switch: false
+permission:
+  switch: false
+  component: false
+  dataset: false
+ragflow:
+  # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported
+  host: 127.0.0.1
+  http_port: 9380
+database:
+  name: 'rag_flow'
+  user: 'root'
+  passwd: 'infini_rag_flow'
+  host: '123.60.95.134'
+  port: 5455
+  max_connections: 100
+  stale_timeout: 30
+oauth:
+  github:
+    client_id: 302129228f0d96055bee
+    secret_key: e518e55ccfcdfcae8996afc40f110e9c95f14fc4
+    url: https://github.com/login/oauth/access_token
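
Worth flagging as a review note: this file commits real-looking secrets (a GitHub OAuth secret_key, a database password, and a concrete database host), and the commit adds `conf/private.pem` alongside it. If these are live credentials, they should presumably be rotated and supplied via environment variables or an untracked local config instead.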
docker/.env
DELETED
@@ -1,21 +0,0 @@
-# Version of Elastic products
-STACK_VERSION=8.11.3
-
-# Set the cluster name
-CLUSTER_NAME=docgpt
-
-# Port to expose Elasticsearch HTTP API to the host
-ES_PORT=9200
-
-# Port to expose Kibana to the host
-KIBANA_PORT=6601
-
-# Increase or decrease based on the available host memory (in bytes)
-MEM_LIMIT=4073741824
-
-POSTGRES_USER=root
-POSTGRES_PASSWORD=infiniflow_docgpt
-POSTGRES_DB=docgpt
-
-MINIO_USER=infiniflow
-MINIO_PASSWORD=infiniflow_docgpt
docker/docker-compose.yml
CHANGED
@@ -1,7 +1,7 @@
 version: '2.2'
 services:
   es01:
-    container_name:
+    container_name: ragflow-es-01
     image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
     volumes:
       - esdata01:/usr/share/elasticsearch/data
@@ -20,14 +20,14 @@ services:
       soft: -1
      hard: -1
     networks:
-      -
+      - ragflow
    restart: always

   kibana:
     depends_on:
       - es01
     image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name:
+    container_name: ragflow-kibana
     volumes:
       - kibanadata:/usr/share/kibana/data
     ports:
@@ -37,26 +37,39 @@ services:
       - ELASTICSEARCH_HOSTS=http://es01:9200
     mem_limit: ${MEM_LIMIT}
     networks:
-      -
+      - ragflow

-
-    image:
-    container_name:
+  mysql:
+    image: mysql:5.7.18
+    container_name: ragflow-mysql
     environment:
-      -
-      -
+      - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
+      - TZ="Asia/Shanghai"
+    command:
+      --max_connections=1000
+      --character-set-server=utf8mb4
+      --collation-server=utf8mb4_general_ci
+      --default-authentication-plugin=mysql_native_password
+      --tls_version="TLSv1.2,TLSv1.3"
+      --init-file /data/application/init.sql
     ports:
-      -
+      - ${MYSQL_PORT}:3306
     volumes:
-      -
+      - mysql_data:/var/lib/mysql
+      - ./init.sql:/data/application/init.sql
     networks:
-      -
+      - ragflow
+    healthcheck:
+      test: [ "CMD-SHELL", "curl --silent localhost:3306 >/dev/null || exit 1" ]
+      interval: 10s
+      timeout: 10s
+      retries: 3
     restart: always

+
   minio:
     image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
-    container_name:
+    container_name: ragflow-minio
     command: server --console-address ":9001" /data
     ports:
       - 9000:9000
@@ -67,7 +80,7 @@ services:
     volumes:
       - minio_data:/data
     networks:
-      -
+      - ragflow
     restart: always

@@ -76,11 +89,11 @@ volumes:
     driver: local
   kibanadata:
     driver: local
-
+  mysql_data:
     driver: local
   minio_data:
     driver: local

 networks:
-
+  ragflow:
     driver: bridge
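
One tentative observation on the new mysql service: the healthcheck curls the MySQL port, which only proves the TCP socket answers; MySQL does not speak HTTP, so a probe such as mysqladmin ping would confirm the server is actually ready for queries. The `--init-file /data/application/init.sql` option, backed by the `./init.sql` mount, is what creates the `rag_flow` database on first boot (see `docker/init.sql` below).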
docker/init.sql
ADDED
@@ -0,0 +1,2 @@
+CREATE DATABASE IF NOT EXISTS rag_flow;
+USE rag_flow;
migration/Cargo.toml
DELETED
@@ -1,20 +0,0 @@
-[package]
-name = "migration"
-version = "0.1.0"
-edition = "2021"
-publish = false
-
-[lib]
-name = "migration"
-path = "src/lib.rs"
-
-[dependencies]
-async-std = { version = "1", features = ["attributes", "tokio1"] }
-chrono = "0.4.31"
-
-[dependencies.sea-orm-migration]
-version = "0.12.0"
-features = [
-    "runtime-tokio-rustls",  # `ASYNC_RUNTIME` feature
-    "sqlx-postgres",         # `DATABASE_DRIVER` feature
-]
migration/README.md
DELETED
@@ -1,41 +0,0 @@
-# Running Migrator CLI
-
-- Generate a new migration file
-    ```sh
-    cargo run -- generate MIGRATION_NAME
-    ```
-- Apply all pending migrations
-    ```sh
-    cargo run
-    ```
-    ```sh
-    cargo run -- up
-    ```
-- Apply first 10 pending migrations
-    ```sh
-    cargo run -- up -n 10
-    ```
-- Rollback last applied migrations
-    ```sh
-    cargo run -- down
-    ```
-- Rollback last 10 applied migrations
-    ```sh
-    cargo run -- down -n 10
-    ```
-- Drop all tables from the database, then reapply all migrations
-    ```sh
-    cargo run -- fresh
-    ```
-- Rollback all applied migrations, then reapply all migrations
-    ```sh
-    cargo run -- refresh
-    ```
-- Rollback all applied migrations
-    ```sh
-    cargo run -- reset
-    ```
-- Check the status of all migrations
-    ```sh
-    cargo run -- status
-    ```
migration/src/lib.rs
DELETED
@@ -1,12 +0,0 @@
-pub use sea_orm_migration::prelude::*;
-
-mod m20220101_000001_create_table;
-
-pub struct Migrator;
-
-#[async_trait::async_trait]
-impl MigratorTrait for Migrator {
-    fn migrations() -> Vec<Box<dyn MigrationTrait>> {
-        vec![Box::new(m20220101_000001_create_table::Migration)]
-    }
-}
migration/src/m20220101_000001_create_table.rs
DELETED
@@ -1,440 +0,0 @@
-use sea_orm_migration::prelude::*;
-use chrono::{ FixedOffset, Utc };
-
-#[allow(dead_code)]
-fn now() -> chrono::DateTime<FixedOffset> {
-    Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap())
-}
-#[derive(DeriveMigrationName)]
-pub struct Migration;
-
-#[async_trait::async_trait]
-impl MigrationTrait for Migration {
-    async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
-        manager.create_table(
-            Table::create()
-                .table(UserInfo::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(UserInfo::Uid)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(UserInfo::Email).string().not_null())
-                .col(ColumnDef::new(UserInfo::Nickname).string().not_null())
-                .col(ColumnDef::new(UserInfo::AvatarBase64).string())
-                .col(ColumnDef::new(UserInfo::ColorScheme).string().default("dark"))
-                .col(ColumnDef::new(UserInfo::ListStyle).string().default("list"))
-                .col(ColumnDef::new(UserInfo::Language).string().default("chinese"))
-                .col(ColumnDef::new(UserInfo::Password).string().not_null())
-                .col(
-                    ColumnDef::new(UserInfo::LastLoginAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                )
-                .col(
-                    ColumnDef::new(UserInfo::CreatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(
-                    ColumnDef::new(UserInfo::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(UserInfo::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(TagInfo::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(TagInfo::Tid)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(TagInfo::Uid).big_integer().not_null())
-                .col(ColumnDef::new(TagInfo::TagName).string().not_null())
-                .col(ColumnDef::new(TagInfo::Regx).string())
-                .col(ColumnDef::new(TagInfo::Color).tiny_unsigned().default(1))
-                .col(ColumnDef::new(TagInfo::Icon).tiny_unsigned().default(1))
-                .col(ColumnDef::new(TagInfo::FolderId).big_integer())
-                .col(
-                    ColumnDef::new(TagInfo::CreatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(
-                    ColumnDef::new(TagInfo::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(TagInfo::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(Tag2Doc::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(Tag2Doc::Id)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(Tag2Doc::TagId).big_integer())
-                .col(ColumnDef::new(Tag2Doc::Did).big_integer())
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(Kb2Doc::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(Kb2Doc::Id)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(Kb2Doc::KbId).big_integer())
-                .col(ColumnDef::new(Kb2Doc::Did).big_integer())
-                .col(ColumnDef::new(Kb2Doc::KbProgress).float().default(0))
-                .col(ColumnDef::new(Kb2Doc::KbProgressMsg).string().default(""))
-                .col(
-                    ColumnDef::new(Kb2Doc::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(Kb2Doc::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(Dialog2Kb::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(Dialog2Kb::Id)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(Dialog2Kb::DialogId).big_integer())
-                .col(ColumnDef::new(Dialog2Kb::KbId).big_integer())
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(Doc2Doc::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(Doc2Doc::Id)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(Doc2Doc::ParentId).big_integer())
-                .col(ColumnDef::new(Doc2Doc::Did).big_integer())
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(KbInfo::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(KbInfo::KbId)
-                        .big_integer()
-                        .auto_increment()
-                        .not_null()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(KbInfo::Uid).big_integer().not_null())
-                .col(ColumnDef::new(KbInfo::KbName).string().not_null())
-                .col(ColumnDef::new(KbInfo::Icon).tiny_unsigned().default(1))
-                .col(
-                    ColumnDef::new(KbInfo::CreatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(
-                    ColumnDef::new(KbInfo::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(KbInfo::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(DocInfo::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(DocInfo::Did)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(DocInfo::Uid).big_integer().not_null())
-                .col(ColumnDef::new(DocInfo::DocName).string().not_null())
-                .col(ColumnDef::new(DocInfo::Location).string().not_null())
-                .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
-                .col(ColumnDef::new(DocInfo::Type).string().not_null())
-                .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
-                .comment("doc type|folder")
-                .col(
-                    ColumnDef::new(DocInfo::CreatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(
-                    ColumnDef::new(DocInfo::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(DocInfo::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        manager.create_table(
-            Table::create()
-                .table(DialogInfo::Table)
-                .if_not_exists()
-                .col(
-                    ColumnDef::new(DialogInfo::DialogId)
-                        .big_integer()
-                        .not_null()
-                        .auto_increment()
-                        .primary_key()
-                )
-                .col(ColumnDef::new(DialogInfo::Uid).big_integer().not_null())
-                .col(ColumnDef::new(DialogInfo::KbId).big_integer().not_null())
-                .col(ColumnDef::new(DialogInfo::DialogName).string().not_null())
-                .col(ColumnDef::new(DialogInfo::History).string().comment("json"))
-                .col(
-                    ColumnDef::new(DialogInfo::CreatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(
-                    ColumnDef::new(DialogInfo::UpdatedAt)
-                        .timestamp_with_time_zone()
-                        .default(Expr::current_timestamp())
-                        .not_null()
-                )
-                .col(ColumnDef::new(DialogInfo::IsDeleted).boolean().default(false))
-                .to_owned()
-        ).await?;
-
-        let root_insert = Query::insert()
-            .into_table(UserInfo::Table)
-            .columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password])
-            .values_panic(["[email protected]".into(), "root".into(), "123456".into()])
-            .to_owned();
-
-        let doc_insert = Query::insert()
-            .into_table(DocInfo::Table)
-            .columns([
-                DocInfo::Uid,
-                DocInfo::DocName,
-                DocInfo::Size,
-                DocInfo::Type,
-                DocInfo::Location,
-            ])
-            .values_panic([(1).into(), "/".into(), (0).into(), "folder".into(), "".into()])
-            .to_owned();
-
-        let tag_insert = Query::insert()
-            .into_table(TagInfo::Table)
-            .columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon])
-            .values_panic([
-                (1).into(),
-                "Video".into(),
-                ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
-                (1).into(),
-                (1).into(),
-            ])
-            .values_panic([
-                (1).into(),
-                "Picture".into(),
-                ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
-                (2).into(),
-                (2).into(),
-            ])
-            .values_panic([
-                (1).into(),
-                "Music".into(),
-                ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
-                (3).into(),
-                (3).into(),
-            ])
-            .values_panic([
-                (1).into(),
-                "Document".into(),
-                ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
-                (3).into(),
-                (3).into(),
-            ])
-            .to_owned();
-
-        manager.exec_stmt(root_insert).await?;
-        manager.exec_stmt(doc_insert).await?;
-        manager.exec_stmt(tag_insert).await?;
-        Ok(())
-    }
-
-    async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
-        manager.drop_table(Table::drop().table(UserInfo::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(TagInfo::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(Tag2Doc::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(Kb2Doc::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(Dialog2Kb::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(Doc2Doc::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(KbInfo::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(DocInfo::Table).to_owned()).await?;
-
-        manager.drop_table(Table::drop().table(DialogInfo::Table).to_owned()).await?;
-
-        Ok(())
-    }
-}
-
-#[derive(DeriveIden)]
-enum UserInfo {
-    Table,
-    Uid,
-    Email,
-    Nickname,
-    AvatarBase64,
-    ColorScheme,
-    ListStyle,
-    Language,
-    Password,
-    LastLoginAt,
-    CreatedAt,
-    UpdatedAt,
-    IsDeleted,
-}
-
-#[derive(DeriveIden)]
-enum TagInfo {
-    Table,
-    Tid,
-    Uid,
-    TagName,
-    Regx,
-    Color,
-    Icon,
-    FolderId,
-    CreatedAt,
-    UpdatedAt,
-    IsDeleted,
-}
-
-#[derive(DeriveIden)]
-enum Tag2Doc {
-    Table,
-    Id,
-    TagId,
-    Did,
-}
-
-#[derive(DeriveIden)]
-enum Kb2Doc {
-    Table,
-    Id,
-    KbId,
-    Did,
-    KbProgress,
-    KbProgressMsg,
-    UpdatedAt,
-    IsDeleted,
-}
-
-#[derive(DeriveIden)]
-enum Dialog2Kb {
-    Table,
-    Id,
-    DialogId,
-    KbId,
-}
-
-#[derive(DeriveIden)]
-enum Doc2Doc {
-    Table,
-    Id,
-    ParentId,
-    Did,
-}
-
-#[derive(DeriveIden)]
-enum KbInfo {
-    Table,
-    KbId,
-    Uid,
-    KbName,
-    Icon,
-    CreatedAt,
-    UpdatedAt,
-    IsDeleted,
-}
-
-#[derive(DeriveIden)]
-enum DocInfo {
-    Table,
-    Did,
-    Uid,
-    DocName,
-    Location,
-    Size,
-    Type,
-    ThumbnailBase64,
-    CreatedAt,
-    UpdatedAt,
-    IsDeleted,
-}
-
-#[derive(DeriveIden)]
-enum DialogInfo {
-    Table,
-    Uid,
-    KbId,
-    DialogId,
-    DialogName,
-    History,
-    CreatedAt,
-    UpdatedAt,
-    IsDeleted,
-}
migration/src/main.rs
DELETED
@@ -1,6 +0,0 @@
-use sea_orm_migration::prelude::*;
-
-#[async_std::main]
-async fn main() {
-    cli::run_cli(migration::Migrator).await;
-}
python/Dockerfile
ADDED
@@ -0,0 +1,29 @@
+FROM ubuntu:22.04 as base
+
+RUN apt-get update
+
+ENV TZ="Asia/Taipei"
+RUN apt-get install -yq \
+    build-essential \
+    curl \
+    libncursesw5-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    software-properties-common \
+    python3.11 python3.11-dev python3-pip
+
+RUN apt-get install -yq git
+RUN pip3 config set global.index-url https://mirror.baidu.com/pypi/simple
+RUN pip3 config set global.trusted-host mirror.baidu.com
+RUN pip3 install --upgrade pip
+RUN pip3 install torch==2.0.1
+RUN pip3 install torch-model-archiver==0.8.2
+RUN pip3 install torchvision==0.15.2
+COPY requirements.txt .
+
+WORKDIR /docgpt
+ENV PYTHONPATH=/docgpt/
+
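
Two details in this Dockerfile worth flagging, tentatively: `COPY requirements.txt .` runs before `WORKDIR /docgpt`, so the file lands at the image root rather than in the working directory, and no `pip3 install -r requirements.txt` step follows it, so the copied file is never consumed in this stage. Presumably dependencies are installed in a later layer or at container start; if not, this looks like an oversight.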
python/README.md
DELETED
@@ -1,22 +0,0 @@
-
-```shell
-
-docker pull postgres
-
-LOCAL_POSTGRES_DATA=./postgres-data
-
-docker run
---name docass-postgres
--p 5455:5432
--v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data
--e POSTGRES_USER=root
--e POSTGRES_PASSWORD=infiniflow_docass
--e POSTGRES_DB=docass
--d
-postgres
-
-docker network create elastic
-docker pull elasticsearch:8.11.3;
-docker pull docker.elastic.co/kibana/kibana:8.11.3
-
-```
python/{nlp/__init__.py → ToPDF.pdf}
RENAMED
File without changes
python/]
ADDED
@@ -0,0 +1,63 @@
+from abc import ABC
+from openai import OpenAI
+import os
+import base64
+from io import BytesIO
+
+class Base(ABC):
+    def describe(self, image, max_tokens=300):
+        raise NotImplementedError("Please implement encode method!")
+
+
+class GptV4(Base):
+    def __init__(self):
+        import openapi
+        openapi.api_key = os.environ["OPENAPI_KEY"]
+        self.client = OpenAI()
+
+    def describe(self, image, max_tokens=300):
+        buffered = BytesIO()
+        try:
+            image.save(buffered, format="JPEG")
+        except Exception as e:
+            image.save(buffered, format="PNG")
+        b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+        res = self.client.chat.completions.create(
+            model="gpt-4-vision-preview",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{b64}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            max_tokens=max_tokens,
+        )
+        return res.choices[0].message.content.strip()
+
+
+class QWen(Base):
+    def chat(self, system, history, gen_conf):
+        from http import HTTPStatus
+        from dashscope import Generation
+        from dashscope.api_entities.dashscope_response import Role
+        # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
+        response = Generation.call(
+            Generation.Models.qwen_turbo,
+            messages=messages,
+            result_format='message'
+        )
+        if response.status_code == HTTPStatus.OK:
+            return response.output.choices[0]['message']['content']
+        return response.message
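
The path `python/]` looks like a shell-redirection accident rather than an intended filename; its content mirrors the vision-model wrappers this commit also lands in `rag/llm/cv_model.py`. The committed code has a few apparent slips: `import openapi` and `os.environ["OPENAPI_KEY"]` are presumably meant to be the `openai` package and `OPENAI_API_KEY`, and `QWen.chat` references a `messages` variable it never builds. A minimal corrected sketch under those assumptions (not the committed code):

```python
import os
from http import HTTPStatus

from openai import OpenAI
from dashscope import Generation


class GptV4Fixed:
    def __init__(self):
        # The openai>=1.x client reads OPENAI_API_KEY from the environment
        # itself; the stray `import openapi` / OPENAPI_KEY lines can go.
        assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is not set"
        self.client = OpenAI()


class QWenFixed:
    def chat(self, system, history, gen_conf):
        # Build the message list the committed code forgot to construct.
        messages = [{"role": "system", "content": system}] + history
        response = Generation.call(
            Generation.Models.qwen_turbo,
            messages=messages,
            result_format="message",
        )
        if response.status_code == HTTPStatus.OK:
            return response.output.choices[0]["message"]["content"]
        return response.message
```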
python/conf/logging.json
DELETED
@@ -1,41 +0,0 @@
-{
-    "version":1,
-    "disable_existing_loggers":false,
-    "formatters":{
-        "simple":{
-            "format":"%(asctime)s - %(name)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s"
-        }
-    },
-    "handlers":{
-        "console":{
-            "class":"logging.StreamHandler",
-            "level":"DEBUG",
-            "formatter":"simple",
-            "stream":"ext://sys.stdout"
-        },
-        "info_file_handler":{
-            "class":"logging.handlers.TimedRotatingFileHandler",
-            "level":"INFO",
-            "formatter":"simple",
-            "filename":"log/info.log",
-            "when": "MIDNIGHT",
-            "interval":1,
-            "backupCount":30,
-            "encoding":"utf8"
-        },
-        "error_file_handler":{
-            "class":"logging.handlers.TimedRotatingFileHandler",
-            "level":"ERROR",
-            "formatter":"simple",
-            "filename":"log/errors.log",
-            "when": "MIDNIGHT",
-            "interval":1,
-            "backupCount":30,
-            "encoding":"utf8"
-        }
-    },
-    "root":{
-        "level":"DEBUG",
-        "handlers":["console","info_file_handler","error_file_handler"]
-    }
-}
python/conf/sys.cnf
DELETED
@@ -1,9 +0,0 @@
-[infiniflow]
-es=http://es01:9200
-postgres_user=root
-postgres_password=infiniflow_docgpt
-postgres_host=postgres
-postgres_port=5432
-minio_host=minio:9000
-minio_user=infiniflow
-minio_password=infiniflow_docgpt
python/llm/__init__.py
DELETED
@@ -1,21 +0,0 @@
-import os
-from .embedding_model import *
-from .chat_model import *
-from .cv_model import *
-
-EmbeddingModel = None
-ChatModel = None
-CvModel = None
-
-
-if os.environ.get("OPENAI_API_KEY"):
-    EmbeddingModel = GptEmbed()
-    ChatModel = GptTurbo()
-    CvModel = GptV4()
-
-elif os.environ.get("DASHSCOPE_API_KEY"):
-    EmbeddingModel = QWenEmbd()
-    ChatModel = QWenChat()
-    CvModel = QWenCV()
-else:
-    EmbeddingModel = HuEmbedding()
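
This deleted factory (recreated as `rag/llm/__init__.py` in this commit) binds a backend once at import time from environment variables, so callers never branch on the vendor. A hedged usage sketch based on call sites visible elsewhere in this commit (`dialog_svr.py` calls `ChatModel.chat(system, history, gen_conf)`):

```python
# Hedged sketch: assumes OPENAI_API_KEY or DASHSCOPE_API_KEY is exported
# before import, which is what selects the backend.
from llm import ChatModel

answer = ChatModel.chat(
    "You are a helpful assistant.",            # system prompt
    [{"role": "user", "content": "Hi!"}],      # chat history
    {"temperature": 0.2, "max_tokens": 512},   # generation config
)
print(answer)
```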
python/output/ToPDF.pdf
ADDED
File without changes
python/requirements.txt
DELETED
@@ -1,194 +0,0 @@
-accelerate==0.24.1
-addict==2.4.0
-aiobotocore==2.7.0
-aiofiles==23.2.1
-aiohttp==3.8.6
-aioitertools==0.11.0
-aiosignal==1.3.1
-aliyun-python-sdk-core==2.14.0
-aliyun-python-sdk-kms==2.16.2
-altair==5.1.2
-anyio==3.7.1
-astor==0.8.1
-async-timeout==4.0.3
-attrdict==2.0.1
-attrs==23.1.0
-Babel==2.13.1
-bce-python-sdk==0.8.92
-beautifulsoup4==4.12.2
-bitsandbytes==0.41.1
-blinker==1.7.0
-botocore==1.31.64
-cachetools==5.3.2
-certifi==2023.7.22
-cffi==1.16.0
-charset-normalizer==3.3.2
-click==8.1.7
-cloudpickle==3.0.0
-contourpy==1.2.0
-crcmod==1.7
-cryptography==41.0.5
-cssselect==1.2.0
-cssutils==2.9.0
-cycler==0.12.1
-Cython==3.0.5
-datasets==2.13.0
-datrie==0.8.2
-decorator==5.1.1
-defusedxml==0.7.1
-dill==0.3.6
-einops==0.7.0
-elastic-transport==8.10.0
-elasticsearch==8.10.1
-elasticsearch-dsl==8.9.0
-et-xmlfile==1.1.0
-fastapi==0.104.1
-ffmpy==0.3.1
-filelock==3.13.1
-fire==0.5.0
-FlagEmbedding==1.1.5
-Flask==3.0.0
-flask-babel==4.0.0
-fonttools==4.44.0
-frozenlist==1.4.0
-fsspec==2023.10.0
-future==0.18.3
-gast==0.5.4
--e
-git+https://github.com/ggerganov/llama.cpp.git@5f6e0c0dff1e7a89331e6b25eca9a9fd71324069#egg=gguf&subdirectory=gguf-py
-gradio==3.50.2
-gradio_client==0.6.1
-greenlet==3.0.1
-h11==0.14.0
-hanziconv==0.3.2
-httpcore==1.0.1
-httpx==0.25.1
-huggingface-hub==0.17.3
-idna==3.4
-imageio==2.31.6
-imgaug==0.4.0
-importlib-metadata==6.8.0
-importlib-resources==6.1.0
-install==1.3.5
-itsdangerous==2.1.2
-Jinja2==3.1.2
-jmespath==0.10.0
-joblib==1.3.2
-jsonschema==4.19.2
-jsonschema-specifications==2023.7.1
-kiwisolver==1.4.5
-lazy_loader==0.3
-lmdb==1.4.1
-lxml==4.9.3
-MarkupSafe==2.1.3
-matplotlib==3.8.1
-modelscope==1.9.4
-mpmath==1.3.0
-multidict==6.0.4
-multiprocess==0.70.14
-networkx==3.2.1
-nltk==3.8.1
-numpy==1.24.4
-nvidia-cublas-cu12==12.1.3.1
-nvidia-cuda-cupti-cu12==12.1.105
-nvidia-cuda-nvrtc-cu12==12.1.105
-nvidia-cuda-runtime-cu12==12.1.105
-nvidia-cudnn-cu12==8.9.2.26
-nvidia-cufft-cu12==11.0.2.54
-nvidia-curand-cu12==10.3.2.106
-nvidia-cusolver-cu12==11.4.5.107
-nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu12==2.18.1
-nvidia-nvjitlink-cu12==12.3.52
-nvidia-nvtx-cu12==12.1.105
-opencv-contrib-python==4.6.0.66
-opencv-python==4.6.0.66
-openpyxl==3.1.2
-opt-einsum==3.3.0
-orjson==3.9.10
-oss2==2.18.3
-packaging==23.2
-paddleocr==2.7.0.3
-paddlepaddle-gpu==2.5.2.post120
-pandas==2.1.2
-pdf2docx==0.5.5
-pdfminer.six==20221105
-pdfplumber==0.10.3
-Pillow==10.0.1
-platformdirs==3.11.0
-premailer==3.10.0
-protobuf==4.25.0
-psutil==5.9.6
-pyarrow==14.0.0
-pyclipper==1.3.0.post5
-pycocotools==2.0.7
-pycparser==2.21
-pycryptodome==3.19.0
-pydantic==1.10.13
-pydub==0.25.1
-PyMuPDF==1.20.2
-pyparsing==3.1.1
-pypdfium2==4.23.1
-python-dateutil==2.8.2
-python-docx==1.1.0
-python-multipart==0.0.6
-pytz==2023.3.post1
-PyYAML==6.0.1
-rapidfuzz==3.5.2
-rarfile==4.1
-referencing==0.30.2
-regex==2023.10.3
-requests==2.31.0
-rpds-py==0.12.0
-s3fs==2023.10.0
-safetensors==0.4.0
-scikit-image==0.22.0
-scikit-learn==1.3.2
-scipy==1.11.3
-semantic-version==2.10.0
-sentence-transformers==2.2.2
-sentencepiece==0.1.98
-shapely==2.0.2
-simplejson==3.19.2
-six==1.16.0
-sniffio==1.3.0
-sortedcontainers==2.4.0
-soupsieve==2.5
-SQLAlchemy==2.0.23
-starlette==0.27.0
-sympy==1.12
-tabulate==0.9.0
-tblib==3.0.0
-termcolor==2.3.0
-threadpoolctl==3.2.0
-tifffile==2023.9.26
-tiktoken==0.5.1
-timm==0.9.10
-tokenizers==0.13.3
-tomli==2.0.1
-toolz==0.12.0
-torch==2.1.0
-torchaudio==2.1.0
-torchvision==0.16.0
-tornado==6.3.3
-tqdm==4.66.1
-transformers==4.33.0
-transformers-stream-generator==0.0.4
-triton==2.1.0
-typing_extensions==4.8.0
-tzdata==2023.3
-urllib3==2.0.7
-uvicorn==0.24.0
-uvloop==0.19.0
-visualdl==2.5.3
-websockets==11.0.3
-Werkzeug==3.0.1
-wrapt==1.15.0
-xgboost==2.0.1
-xinference==0.6.0
-xorbits==0.7.0
-xoscar==0.1.3
-xxhash==3.4.1
-yapf==0.40.2
-yarl==1.9.2
-zipp==3.17.0
python/res/1-0.tm
ADDED
@@ -0,0 +1,8 @@
+2023-12-20 11:44:08.791336+00:00
+2023-12-20 11:44:08.853249+00:00
+2023-12-20 11:44:08.909933+00:00
+2023-12-21 00:47:09.996757+00:00
+2023-12-20 11:44:08.965855+00:00
+2023-12-20 11:44:09.011682+00:00
+2023-12-21 00:47:10.063326+00:00
+2023-12-20 11:44:09.069486+00:00
python/res/thumbnail-1-0.tm
ADDED
@@ -0,0 +1,3 @@
+2023-12-27 08:21:49.309802+00:00
+2023-12-27 08:37:22.407772+00:00
+2023-12-27 08:59:18.845627+00:00
python/svr/add_thumbnail2file.py
DELETED
@@ -1,118 +0,0 @@
-import sys, datetime, random, re, cv2
-from os.path import dirname, realpath
-sys.path.append(dirname(realpath(__file__)) + "/../")
-from util.db_conn import Postgres
-from util.minio_conn import HuMinio
-from util import findMaxDt
-import base64
-from io import BytesIO
-import pandas as pd
-from PIL import Image
-import pdfplumber
-
-
-PG = Postgres("infiniflow", "docgpt")
-MINIO = HuMinio("infiniflow")
-def set_thumbnail(did, base64):
-    sql = f"""
-    update doc_info set thumbnail_base64='{base64}'
-    where
-    did={did}
-    """
-    PG.update(sql)
-
-
-def collect(comm, mod, tm):
-    sql = f"""
-    select
-    did, uid, doc_name, location, updated_at
-    from doc_info
-    where
-    updated_at >= '{tm}'
-    and MOD(did, {comm}) = {mod}
-    and is_deleted=false
-    and type <> 'folder'
-    and thumbnail_base64=''
-    order by updated_at asc
-    limit 10
-    """
-    docs = PG.select(sql)
-    if len(docs) == 0:return pd.DataFrame()
-
-    mtm = str(docs["updated_at"].max())[:19]
-    print("TOTAL:", len(docs), "To: ", mtm)
-    return docs
-
-
-def build(row):
-    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
-                     row["doc_name"].lower().strip()):
-        set_thumbnail(row["did"], "_")
-        return
-
-    def thumbnail(img, SIZE=128):
-        w,h = img.size
-        p = SIZE/max(w, h)
-        w, h = int(w*p), int(h*p)
-        img.thumbnail((w, h))
-        buffered = BytesIO()
-        try:
-            img.save(buffered, format="JPEG")
-        except Exception as e:
-            try:
-                img.save(buffered, format="PNG")
-            except Exception as ee:
-                pass
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
-
-
-    iobytes = BytesIO(MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
-    if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
-        pdf = pdfplumber.open(iobytes)
-        img = pdf.pages[0].to_image().annotated
-        set_thumbnail(row["did"], thumbnail(img))
-
-    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
-        img = Image.open(iobytes)
-        set_thumbnail(row["did"], thumbnail(img))
-
-    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
-        url = MINIO.get_presigned_url("%s-upload"%str(row["uid"]),
-                                      row["location"],
-                                      expires=datetime.timedelta(seconds=60)
-                                      )
-        cap = cv2.VideoCapture(url)
-        succ = cap.isOpened()
-        i = random.randint(1, 11)
-        while succ:
-            ret, frame = cap.read()
-            if not ret: break
-            if i > 0:
-                i -= 1
-                continue
-            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-            print(img.size)
-            set_thumbnail(row["did"], thumbnail(img))
-        cap.release()
-        cv2.destroyAllWindows()
-
-
-def main(comm, mod):
-    global model
-    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
-    tm = findMaxDt(tm_fnm)
-    rows = collect(comm, mod, tm)
-    if len(rows) == 0:return
-
-    tmf = open(tm_fnm, "a+")
-    for _, r in rows.iterrows():
-        build(r)
-        tmf.write(str(r["updated_at"]) + "\n")
-    tmf.close()
-
-
-if __name__ == "__main__":
-    from mpi4py import MPI
-    comm = MPI.COMM_WORLD
-    main(comm.Get_size(), comm.Get_rank())
-
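
One apparent bug in the deleted video branch of `build()`: after the random frame-skip counter `i` reaches zero, the loop keeps reading and re-thumbnailing every remaining frame instead of stopping. A hedged fix sketch, assuming one thumbnail per video was the intent:

```python
# Same loop as the committed code, plus a break after the first capture.
while succ:
    ret, frame = cap.read()
    if not ret: break
    if i > 0:
        i -= 1
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    set_thumbnail(row["did"], thumbnail(img))
    break  # stop after one frame
```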
python/svr/dialog_svr.py
DELETED
@@ -1,165 +0,0 @@
-#-*- coding:utf-8 -*-
-import sys, os, re,inspect,json,traceback,logging,argparse, copy
-sys.path.append(os.path.realpath(os.path.dirname(inspect.getfile(inspect.currentframe())))+"/../")
-from tornado.web import RequestHandler,Application
-from tornado.ioloop import IOLoop
-from tornado.httpserver import HTTPServer
-from tornado.options import define,options
-from util import es_conn, setup_logging
-from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
-from nlp import huqie
-from nlp import query as Query
-from nlp import search
-from llm import HuEmbedding, GptTurbo
-import numpy as np
-from io import BytesIO
-from util import config
-from timeit import default_timer as timer
-from collections import OrderedDict
-from llm import ChatModel, EmbeddingModel
-
-SE = None
-CFIELD="content_ltks"
-EMBEDDING = EmbeddingModel
-LLM = ChatModel
-
-def get_QA_pairs(hists):
-    pa = []
-    for h in hists:
-        for k in ["user", "assistant"]:
-            if h.get(k):
-                pa.append({
-                    "content": h[k],
-                    "role": k,
-                })
-
-    for p in pa[:-1]: assert len(p) == 2, p
-    return pa
-
-
-
-def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
-    max_len //= len(top_i)
-    # add instruction to prompt
-    instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
-    if len(instructions)>2:
-        # Said that LLM is sensitive to the first and the last one, so
-        # rearrange the order of references
-        instructions.append(copy.deepcopy(instructions[1]))
-        instructions.pop(1)
-
-    def token_num(txt):
-        c = 0
-        for tk in re.split(r"[,。/?‘’”“:;:;!!]", txt):
-            if re.match(r"[a-zA-Z-]+$", tk):
-                c += 1
-                continue
-            c += len(tk)
-        return c
-
-    _inst = ""
-    for ins in instructions:
-        if token_num(_inst) > 4096:
-            _inst += "\n知识库:" + instructions[-1][:max_len]
-            break
-        _inst += "\n知识库:" + ins[:max_len]
-    return _inst
-
-
-def prompt_and_answer(history, inst):
-    hist = get_QA_pairs(history)
-    chks = []
-    for s in re.split(r"[::;;。\n\r]+", inst):
-        if s: chks.append(s)
-    chks = len(set(chks))/(0.1+len(chks))
-    print("Duplication portion:", chks)
-
-    system = """
-你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答%s。当所有知识库内容都与问题无关时,你的回答必须包括"知识库中未找到您要的答案!这是我所知道的,仅作参考。"这句话。回答需要考虑聊天历史。
-以下是知识库:
-%s
-以上是知识库。
-"""%((",最好总结成表格" if chks<0.6 and chks>0 else ""), inst)
-
-    print("【PROMPT】:", system)
-    start = timer()
-    response = LLM.chat(system, hist, {"temperature": 0.2, "max_tokens": 512})
-    print("GENERATE: ", timer()-start)
-    print("===>>", response)
-    return response
-
-
-class Handler(RequestHandler):
-    def post(self):
-        global SE,MUST_TK_NUM
-        param = json.loads(self.request.body.decode('utf-8'))
-        try:
-            question = param.get("history",[{"user": "Hi!"}])[-1]["user"]
-            res = SE.search({
-                "question": question,
-                "kb_ids": param.get("kb_ids", []),
-                "size": param.get("topn", 15)},
-                search.index_name(param["uid"])
-            )
-
-            sim = SE.rerank(res, question)
-            rk_idx = np.argsort(sim*-1)
-            topidx = [i for i in rk_idx if sim[i] >= aram.get("similarity", 0.5)][:param.get("topn",12)]
-            inst = get_instruction(res, topidx)
-
-            ans, topidx = prompt_and_answer(param["history"], inst)
-            ans = SE.insert_citations(ans, topidx, res)
-
-            refer = OrderedDict()
-            docnms = {}
-            for i in rk_idx:
-                did = res.field[res.ids[i]]["doc_id"]
-                if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
-                if did not in refer: refer[did] = []
-                refer[did].append({
-                    "chunk_id": res.ids[i],
-                    "content": res.field[res.ids[i]]["content_ltks"],
-                    "image": ""
-                })
-
-            print("::::::::::::::", ans)
-            self.write(json.dumps({
-                "code":0,
-                "msg":"success",
-                "data":{
-                    "uid": param["uid"],
-                    "dialog_id": param["dialog_id"],
-                    "assistant": ans,
-                    "refer": [{
-                        "did": did,
-                        "doc_name": docnms[did],
-                        "chunks": chunks
-                    } for did, chunks in refer.items()]
-                }
-            }))
-            logging.info("SUCCESS[%d]"%(res.total)+json.dumps(param, ensure_ascii=False))
-
-        except Exception as e:
-            logging.error("Request 500: "+str(e))
-            self.write(json.dumps({
-                "code":500,
-                "msg":str(e),
-                "data":{}
-            }))
-            print(traceback.format_exc())
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", default=4455, type=int, help="Port used for service")
-    ARGS = parser.parse_args()
-
-    SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
-
-    app = Application([(r'/v1/chat/completions', Handler)],debug=False)
-    http_server = HTTPServer(app)
-    http_server.bind(ARGS.port)
-    http_server.start(3)
-
-    IOLoop.current().start()
-
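
Two slips in the deleted `Handler.post` are visible in the diff itself: `aram.get("similarity", 0.5)` is presumably `param.get(...)`, and `prompt_and_answer` returns a single string while the handler unpacks two values (`ans, topidx = ...`), which would raise on every request. A hedged fix sketch of that stretch:

```python
# Corrected excerpt, under the assumption that the topidx produced by the
# rerank step is the value insert_citations expects.
topidx = [i for i in rk_idx
          if sim[i] >= param.get("similarity", 0.5)][:param.get("topn", 12)]
inst = get_instruction(res, topidx)
ans = prompt_and_answer(param["history"], inst)  # returns one value
ans = SE.insert_citations(ans, topidx, res)
```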
python/svr/parse_user_docs.py
DELETED
@@ -1,258 +0,0 @@
-import json, os, sys, hashlib, copy, time, random, re
-from os.path import dirname, realpath
-sys.path.append(dirname(realpath(__file__)) + "/../")
-from util.es_conn import HuEs
-from util.db_conn import Postgres
-from util.minio_conn import HuMinio
-from util import rmSpace, findMaxDt
-from FlagEmbedding import FlagModel
-from nlp import huchunk, huqie, search
-from io import BytesIO
-import pandas as pd
-from elasticsearch_dsl import Q
-from PIL import Image
-from parser import (
-    PdfParser,
-    DocxParser,
-    ExcelParser
-)
-from nlp.huchunk import (
-    PdfChunker,
-    DocxChunker,
-    ExcelChunker,
-    PptChunker,
-    TextChunker
-)
-
-ES = HuEs("infiniflow")
-BATCH_SIZE = 64
-PG = Postgres("infiniflow", "docgpt")
-MINIO = HuMinio("infiniflow")
-
-PDF = PdfChunker(PdfParser())
-DOC = DocxChunker(DocxParser())
-EXC = ExcelChunker(ExcelParser())
-PPT = PptChunker()
-
-def chuck_doc(name, binary):
-    suff = os.path.split(name)[-1].lower().split(".")[-1]
-    if suff.find("pdf") >= 0: return PDF(binary)
-    if suff.find("doc") >= 0: return DOC(binary)
-    if re.match(r"(xlsx|xlsm|xltx|xltm)", suff): return EXC(binary)
-    if suff.find("ppt") >= 0: return PPT(binary)
-    if os.environ.get("PARSE_IMAGE") \
-            and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
-                          name.lower()):
-        from llm import CvModel
-        txt = CvModel.describe(binary)
-        field = TextChunker.Fields()
-        field.text_chunks = [(txt, binary)]
-        field.table_chunks = []
-
-
-    return TextChunker()(binary)
-
-
-def collect(comm, mod, tm):
-    sql = f"""
-    select
-        id as kb2doc_id,
-        kb_id,
-        did,
-        updated_at,
-        is_deleted
-    from kb2_doc
-    where
-        updated_at >= '{tm}'
-        and kb_progress = 0
-        and MOD(did, {comm}) = {mod}
-    order by updated_at asc
-    limit 1000
-    """
-    kb2doc = PG.select(sql)
-    if len(kb2doc) == 0: return pd.DataFrame()
-
-    sql = """
-    select
-        did,
-        uid,
-        doc_name,
-        location,
-        size
-    from doc_info
-    where
-        did in (%s)
-    """ % ",".join([str(i) for i in kb2doc["did"].unique()])
-    docs = PG.select(sql)
-    docs = docs.fillna("")
-    docs = docs.join(kb2doc.set_index("did"), on="did", how="left")
-
-    mtm = str(docs["updated_at"].max())[:19]
-    print("TOTAL:", len(docs), "To: ", mtm)
-    return docs
-
-
-def set_progress(kb2doc_id, prog, msg="Processing..."):
-    sql = f"""
-    update kb2_doc set kb_progress={prog}, kb_progress_msg='{msg}'
-    where
-        id={kb2doc_id}
-    """
-    PG.update(sql)
-
-
-def build(row):
-    if row["size"] > 256000000:
-        set_progress(row["kb2doc_id"], -1, "File size exceeds( <= 256Mb )")
-        return []
-    res = ES.search(Q("term", doc_id=row["did"]))
-    if ES.getTotal(res) > 0:
-        ES.updateScriptByQuery(Q("term", doc_id=row["did"]),
-                               scripts="""
-                               if(!ctx._source.kb_id.contains('%s'))
-                                 ctx._source.kb_id.add('%s');
-                               """ % (str(row["kb_id"]), str(row["kb_id"])),
-                               idxnm=search.index_name(row["uid"])
-                               )
-        set_progress(row["kb2doc_id"], 1, "Done")
-        return []
-
-    random.seed(time.time())
-    set_progress(row["kb2doc_id"], random.randint(0, 20) / 100., "Finished preparing! Start to slice file!")
-    try:
-        obj = chuck_doc(row["doc_name"], MINIO.get("%s-upload" % str(row["uid"]), row["location"]))
-    except Exception as e:
-        if re.search("(No such file|not found)", str(e)):
-            set_progress(row["kb2doc_id"], -1, "Can not find file <%s>" % row["doc_name"])
-        else:
-            set_progress(row["kb2doc_id"], -1, "Internal system error: %s" % str(e).replace("'", ""))
-        return []
-
-    if not obj.text_chunks and not obj.table_chunks:
-        set_progress(row["kb2doc_id"], 1, "Nothing added! Mostly, file type unsupported yet.")
-        return []
-
-    set_progress(row["kb2doc_id"], random.randint(20, 60) / 100., "Finished slicing files. Start to embedding the content.")
-
-    doc = {
-        "doc_id": row["did"],
-        "kb_id": [str(row["kb_id"])],
-        "docnm_kwd": os.path.split(row["location"])[-1],
-        "title_tks": huqie.qie(os.path.split(row["location"])[-1]),
-        "updated_at": str(row["updated_at"]).replace("T", " ")[:19]
-    }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
-    output_buffer = BytesIO()
-    docs = []
-    md5 = hashlib.md5()
-    for txt, img in obj.text_chunks:
-        d = copy.deepcopy(doc)
-        md5.update((txt + str(d["doc_id"])).encode("utf-8"))
-        d["_id"] = md5.hexdigest()
-        d["content_ltks"] = huqie.qie(txt)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
-        if not img:
-            docs.append(d)
-            continue
-
-        if isinstance(img, Image.Image): img.save(output_buffer, format='JPEG')
-        else: output_buffer = BytesIO(img)
-
-        MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
-                  output_buffer.getvalue())
-        d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
-        docs.append(d)
-
-    for arr, img in obj.table_chunks:
-        for i, txt in enumerate(arr):
-            d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            md5.update((txt + str(d["doc_id"])).encode("utf-8"))
-            d["_id"] = md5.hexdigest()
-            if not img:
-                docs.append(d)
-                continue
-            img.save(output_buffer, format='JPEG')
-            MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
-                      output_buffer.getvalue())
-            d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
-            docs.append(d)
-    set_progress(row["kb2doc_id"], random.randint(60, 70) / 100., "Continue embedding the content.")
-
-    return docs
-
-
-def init_kb(row):
-    idxnm = search.index_name(row["uid"])
-    if ES.indexExist(idxnm): return
-    return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
-
-
-model = None
-def embedding(docs):
-    global model
-    tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
-    cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
-    vects = 0.1 * tts + 0.9 * cnts
-    assert len(vects) == len(docs)
-    for i, d in enumerate(docs): d["q_vec"] = vects[i].tolist()
-
-
-def rm_doc_from_kb(df):
-    if len(df) == 0: return
-    for _, r in df.iterrows():
-        ES.updateScriptByQuery(Q("term", doc_id=r["did"]),
-                               scripts="""
-                               if(ctx._source.kb_id.contains('%s'))
-                                 ctx._source.kb_id.remove(
-                                     ctx._source.kb_id.indexOf('%s')
-                                 );
-                               """ % (str(r["kb_id"]), str(r["kb_id"])),
-                               idxnm=search.index_name(r["uid"])
-                               )
-    if len(df) == 0: return
-    sql = """
-    delete from kb2_doc where id in (%s)
-    """ % ",".join([str(i) for i in df["kb2doc_id"]])
-    PG.update(sql)
-
-
-def main(comm, mod):
-    global model
-    from llm import HuEmbedding
-    model = HuEmbedding()
-    tm_fnm = f"res/{comm}-{mod}.tm"
-    tm = findMaxDt(tm_fnm)
-    rows = collect(comm, mod, tm)
-    if len(rows) == 0: return
-
-    rm_doc_from_kb(rows.loc[rows.is_deleted == True])
-    rows = rows.loc[rows.is_deleted == False].reset_index(drop=True)
-    if len(rows) == 0: return
-    tmf = open(tm_fnm, "a+")
-    for _, r in rows.iterrows():
-        cks = build(r)
-        if not cks:
-            tmf.write(str(r["updated_at"]) + "\n")
-            continue
-        ## TODO: exception handler
-        ## set_progress(r["did"], -1, "ERROR: ")
-        embedding(cks)
-
-        set_progress(r["kb2doc_id"], random.randint(70, 95) / 100.,
-                     "Finished embedding! Start to build index!")
-        init_kb(r)
-        es_r = ES.bulk(cks, search.index_name(r["uid"]))
-        if es_r:
-            set_progress(r["kb2doc_id"], -1, "Index failure!")
-            print(es_r)
-        else: set_progress(r["kb2doc_id"], 1., "Done!")
-        tmf.write(str(r["updated_at"]) + "\n")
-    tmf.close()
-
-
-if __name__ == "__main__":
-    from mpi4py import MPI
-    comm = MPI.COMM_WORLD
-    main(comm.Get_size(), comm.Get_rank())
-
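parse_user_docs.py was launched under MPI and sharded the work queue arithmetically: each rank only selects rows where MOD(did, comm) = mod, so N processes split pending documents with no coordination. A small sketch of that partitioning (the document ids are made up for illustration):

    # Run with e.g.: mpirun -n 4 python shard_demo.py  (requires mpi4py)
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    size, rank = comm.Get_size(), comm.Get_rank()

    dids = range(1, 21)  # hypothetical pending document ids
    mine = [did for did in dids if did % size == rank]  # same as MOD(did, size) = rank
    print(f"rank {rank}/{size} handles: {mine}")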
python/tmp.log
ADDED
@@ -0,0 +1,15 @@
+
+----------- Model Configuration -----------
+Model Arch: GFL
+Transform Order:
+--transform op: Resize
+--transform op: NormalizeImage
+--transform op: Permute
+--transform op: PadStride
+--------------------------------------------
+Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
+The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
+Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
+- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+WARNING:root:The files are stored in /opt/home/kevinhu/docgpt/, please check it!
python/util/config.py
DELETED
@@ -1,31 +0,0 @@
-from configparser import ConfigParser
-import os
-import inspect
-
-CF = ConfigParser()
-__fnm = os.path.join(os.path.dirname(__file__), '../conf/sys.cnf')
-if not os.path.exists(__fnm):
-    __fnm = os.path.join(os.path.dirname(__file__), '../../conf/sys.cnf')
-assert os.path.exists(
-    __fnm), f"【EXCEPTION】can't find {__fnm}." + os.path.dirname(__file__)
-if not os.path.exists(__fnm):
-    __fnm = "./sys.cnf"
-
-CF.read(__fnm)
-
-
-class Config:
-    def __init__(self, env):
-        self.env = env
-        if env == "spark":
-            CF.read("./cv.cnf")
-
-    def get(self, key, default=None):
-        global CF
-        return os.environ.get(key.upper(),
-                              CF[self.env].get(key, default)
-                              )
-
-
-def init(env):
-    return Config(env)
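Config.get resolves a key by checking the process environment first (upper-cased key), then the [env] section of sys.cnf, then the supplied default, so deployments could override any setting without editing the file. For example (section and values are illustrative, and assume an [infiniflow] section exists in sys.cnf):

    import os
    from util import config  # the deleted module above

    cfg = config.init("infiniflow")
    os.environ["POSTGRES_HOST"] = "10.0.0.5"      # environment wins...
    print(cfg.get("postgres_host", "127.0.0.1"))  # -> 10.0.0.5
    del os.environ["POSTGRES_HOST"]               # ...otherwise sys.cnf, else the default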
python/util/db_conn.py
DELETED
@@ -1,70 +0,0 @@
-import logging
-import time
-from util import config
-import pandas as pd
-
-
-class Postgres(object):
-    def __init__(self, env, dbnm):
-        self.config = config.init(env)
-        self.conn = None
-        self.dbnm = dbnm
-        self.__open__()
-
-    def __open__(self):
-        import psycopg2
-        try:
-            if self.conn:
-                self.__close__()
-                del self.conn
-        except Exception as e:
-            pass
-
-        try:
-            self.conn = psycopg2.connect(f"""dbname={self.dbnm}
-                                         user={self.config.get('postgres_user')}
-                                         password={self.config.get('postgres_password')}
-                                         host={self.config.get('postgres_host')}
-                                         port={self.config.get('postgres_port')}""")
-        except Exception as e:
-            logging.error(
-                "Fail to connect %s " %
-                self.config.get("pgdb_host") + str(e))
-
-    def __close__(self):
-        try:
-            self.conn.close()
-        except Exception as e:
-            logging.error(
-                "Fail to close %s " %
-                self.config.get("pgdb_host") + str(e))
-
-    def select(self, sql):
-        for _ in range(10):
-            try:
-                return pd.read_sql(sql, self.conn)
-            except Exception as e:
-                logging.error(f"Fail to exec {sql} " + str(e))
-                self.__open__()
-                time.sleep(1)
-
-        return pd.DataFrame()
-
-    def update(self, sql):
-        for _ in range(10):
-            try:
-                cur = self.conn.cursor()
-                cur.execute(sql)
-                updated_rows = cur.rowcount
-                self.conn.commit()
-                cur.close()
-                return updated_rows
-            except Exception as e:
-                logging.error(f"Fail to exec {sql} " + str(e))
-                self.__open__()
-                time.sleep(1)
-        return 0
-
-
-if __name__ == "__main__":
-    Postgres("infiniflow", "docgpt")
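Both select and update retry up to ten times, reopening the connection and sleeping one second between attempts, so a transient Postgres restart degrades into a delay rather than an exception; select falls back to an empty DataFrame, update to 0 rows. Typical use, matching the deleted callers above:

    from util.db_conn import Postgres

    PG = Postgres("infiniflow", "docgpt")
    df = PG.select("select did, doc_name from doc_info limit 10")  # pandas DataFrame
    n = PG.update("update kb2_doc set kb_progress=0 where kb_progress < 0")
    print(len(df), "rows fetched;", n, "rows updated")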
python/util/setup_logging.py
DELETED
@@ -1,36 +0,0 @@
-import json
-import logging.config
-import os
-
-
-def log_dir():
-    fnm = os.path.join(os.path.dirname(__file__), '../log/')
-    if not os.path.exists(fnm):
-        fnm = os.path.join(os.path.dirname(__file__), '../../log/')
-    assert os.path.exists(fnm), f"Can't locate log dir: {fnm}"
-    return fnm
-
-
-def setup_logging(default_path="conf/logging.json",
-                  default_level=logging.INFO,
-                  env_key="LOG_CFG"):
-    path = default_path
-    value = os.getenv(env_key, None)
-    if value:
-        path = value
-    if os.path.exists(path):
-        with open(path, "r") as f:
-            config = json.load(f)
-        fnm = log_dir()
-
-        config["handlers"]["info_file_handler"]["filename"] = fnm + "info.log"
-        config["handlers"]["error_file_handler"]["filename"] = fnm + "error.log"
-        logging.config.dictConfig(config)
-    else:
-        logging.basicConfig(level=default_level)
-
-
-__fnm = os.path.join(os.path.dirname(__file__), 'conf/logging.json')
-if not os.path.exists(__fnm):
-    __fnm = os.path.join(os.path.dirname(__file__), '../../conf/logging.json')
-setup_logging(__fnm)
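setup_logging is applied at import time: it loads the JSON dictConfig file (the path can be overridden through the LOG_CFG environment variable), rewrites the two file-handler paths to the resolved log directory, and only falls back to basicConfig when no config file exists. Consumers therefore just import the module; a sketch of that usage (the override path is hypothetical):

    import os
    os.environ["LOG_CFG"] = "/etc/docgpt/logging.json"  # hypothetical override path

    from util import setup_logging  # noqa: F401 -- configures logging on import
    import logging

    logging.getLogger(__name__).info("logging configured")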
rag/__init__.py
ADDED
File without changes
rag/llm/__init__.py
ADDED
@@ -0,0 +1,32 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from .embedding_model import *
+from .chat_model import *
+from .cv_model import *
+
+
+EmbeddingModel = {
+    "local": HuEmbedding,
+    "OpenAI": OpenAIEmbed,
+    "通义千问": QWenEmbed,
+}
+
+
+CvModel = {
+    "OpenAI": GptV4,
+    "通义千问": QWenCV,
+}
+
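EmbeddingModel and CvModel are plain dicts acting as factories keyed by supplier name, so a caller can pick an implementation from configuration alone. A sketch of the intended lookup (the API key is a placeholder):

    from rag.llm import EmbeddingModel, CvModel

    embd = EmbeddingModel["通义千问"]("sk-...")  # -> QWenEmbed(key)
    vision = CvModel["OpenAI"]("sk-...")         # -> GptV4(key)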
{python → rag}/llm/chat_model.py
RENAMED
@@ -1,3 +1,18 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 from abc import ABC
 from openai import OpenAI
 import os
{python → rag}/llm/cv_model.py
RENAMED
@@ -1,3 +1,18 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 from abc import ABC
 from openai import OpenAI
 import os
@@ -6,6 +21,9 @@ from io import BytesIO
 
 
 class Base(ABC):
+    def __init__(self, key, model_name):
+        pass
+
     def describe(self, image, max_tokens=300):
         raise NotImplementedError("Please implement encode method!")
 
@@ -40,14 +58,15 @@ class Base(ABC):
 
 
 class GptV4(Base):
-    def __init__(self):
-        self.client = OpenAI(
+    def __init__(self, key, model_name="gpt-4-vision-preview"):
+        self.client = OpenAI(api_key=key)
+        self.model_name = model_name
 
     def describe(self, image, max_tokens=300):
        b64 = self.image2base64(image)
 
         res = self.client.chat.completions.create(
-            model=
+            model=self.model_name,
             messages=self.prompt(b64),
             max_tokens=max_tokens,
         )
@@ -55,11 +74,15 @@ class GptV4(Base):
 
 
 class QWenCV(Base):
+    def __init__(self, key, model_name="qwen-vl-chat-v1"):
+        import dashscope
+        dashscope.api_key = key
+        self.model_name = model_name
+
     def describe(self, image, max_tokens=300):
         from http import HTTPStatus
         from dashscope import MultiModalConversation
-
-        response = MultiModalConversation.call(model=MultiModalConversation.Models.qwen_vl_chat_v1,
+        response = MultiModalConversation.call(model=self.model_name,
                                                messages=self.prompt(self.image2base64(image)))
         if response.status_code == HTTPStatus.OK:
             return response.output.choices[0]['message']['content']
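With this change both vision wrappers take the API key and model name at construction instead of hard-coding them inside describe. Assuming that interface, calling them looks like:

    from rag.llm.cv_model import GptV4, QWenCV

    mdl = QWenCV("sk-...")                # or GptV4("sk-...") for OpenAI; key is a placeholder
    with open("page.jpg", "rb") as f:     # hypothetical image file
        print(mdl.describe(f.read(), max_tokens=300))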
{python → rag}/llm/embedding_model.py
RENAMED
@@ -1,12 +1,35 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 from abc import ABC
+
+import dashscope
 from openai import OpenAI
 from FlagEmbedding import FlagModel
 import torch
 import os
 import numpy as np
 
+from rag.utils import num_tokens_from_string
+
 
 class Base(ABC):
+    def __init__(self, key, model_name):
+        pass
+
+
     def encode(self, texts: list, batch_size=32):
         raise NotImplementedError("Please implement encode method!")
 
@@ -28,34 +51,44 @@ class HuEmbedding(Base):
         query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
         use_fp16=torch.cuda.is_available())
 
+
     def encode(self, texts: list, batch_size=32):
+        token_count = 0
+        for t in texts: token_count += num_tokens_from_string(t)
         res = []
         for i in range(0, len(texts), batch_size):
             res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
-        return np.array(res)
+        return np.array(res), token_count
 
 
-class
-    def __init__(self):
-        self.client = OpenAI(
+class OpenAIEmbed(Base):
+    def __init__(self, key, model_name="text-embedding-ada-002"):
+        self.client = OpenAI(api_key=key)
+        self.model_name = model_name
 
     def encode(self, texts: list, batch_size=32):
+        token_count = 0
+        for t in texts: token_count += num_tokens_from_string(t)
         res = self.client.embeddings.create(input=texts,
-                                            model=
+                                            model=self.model_name)
-        return [d["embedding"] for d in res["data"]]
+        return [d["embedding"] for d in res["data"]], token_count
+
 
-class QWenEmbd(Base):
+class QWenEmbed(Base):
+    def __init__(self, key, model_name="text_embedding_v2"):
+        dashscope.api_key = key
+        self.model_name = model_name
+
     def encode(self, texts: list, batch_size=32, text_type="document"):
-        # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
         import dashscope
-        from http import HTTPStatus
         res = []
+        token_count = 0
         for txt in texts:
             resp = dashscope.TextEmbedding.call(
-                model=
+                model=self.model_name,
                 input=txt[:2048],
                 text_type=text_type
             )
             res.append(resp["output"]["embeddings"][0]["embedding"])
-
+            token_count += resp["usage"]["total_tokens"]
+        return res, token_count
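The behavioral change running through this file: every encode now returns a (vectors, token_count) pair so callers can meter usage, which means existing callers such as the embedding() helper in the old parse_user_docs.py have to unpack a tuple. A sketch under that assumption:

    from rag.llm.embedding_model import HuEmbedding

    mdl = HuEmbedding()  # local FlagEmbedding model, no API key needed
    vects, used = mdl.encode(["标题 文本", "正文 内容"])
    print(vects.shape, "embeddings;", used, "tokens counted")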
rag/nlp/__init__.py
ADDED
File without changes

{python → rag}/nlp/huchunk.py
RENAMED
File without changes
{python → rag}/nlp/huqie.py
RENAMED
@@ -9,6 +9,8 @@ import string
 import sys
 from hanziconv import HanziConv
 
+from web_server.utils.file_utils import get_project_base_directory
+
 
 class Huqie:
     def key_(self, line):
@@ -41,14 +43,7 @@ class Huqie:
         self.DEBUG = debug
         self.DENOMINATOR = 1000000
         self.trie_ = datrie.Trie(string.printable)
-        self.DIR_ = ""
-        if os.path.exists("../res/huqie.txt"):
-            self.DIR_ = "../res/huqie"
-        if os.path.exists("./res/huqie.txt"):
-            self.DIR_ = "./res/huqie"
-        if os.path.exists("./huqie.txt"):
-            self.DIR_ = "./huqie"
-        assert self.DIR_, f"【Can't find huqie】"
+        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
 
         self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
         try:
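The old guess-several-relative-paths logic is replaced with a single project-root anchor; synonym.py and term_weight.py below make the same switch. The helper itself lives in web_server/utils/file_utils.py, which is not part of this diff; a plausible sketch of what it is assumed to do:

    import os

    def get_project_base_directory():
        # Assumed behavior: resolve the repository root relative to this file.
        return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

    print(os.path.join(get_project_base_directory(), "rag/res", "huqie"))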
{python → rag}/nlp/query.py
RENAMED
@@ -1,12 +1,12 @@
+# -*- coding: utf-8 -*-
+
 import json
 import re
-import sys
-import os
 import logging
 import copy
 import math
 from elasticsearch_dsl import Q, Search
-from nlp import huqie, term_weight, synonym
+from rag.nlp import huqie, term_weight, synonym
 
 
 class EsQueryer:
{python → rag}/nlp/search.py
RENAMED
@@ -1,13 +1,11 @@
+# -*- coding: utf-8 -*-
 import re
 from elasticsearch_dsl import Q, Search, A
 from typing import List, Optional, Tuple, Dict, Union
 from dataclasses import dataclass
-from
-from nlp import huqie, query
-from datetime import datetime
-from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
+from rag.utils import rmSpace
+from rag.nlp import huqie, query
 import numpy as np
-from copy import deepcopy
 
 
 def index_name(uid): return f"docgpt_{uid}"
{python → rag}/nlp/synonym.py
RENAMED
@@ -1,8 +1,11 @@
 import json
+import os
 import time
 import logging
 import re
 
+from web_server.utils.file_utils import get_project_base_directory
+
 
 class Dealer:
     def __init__(self, redis=None):
@@ -10,18 +13,12 @@ class Dealer:
         self.lookup_num = 100000000
         self.load_tm = time.time() - 1000000
         self.dictionary = None
+        path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
         try:
-            self.dictionary = json.load(open(
-        except Exception as e:
-            pass
-        try:
-            self.dictionary = json.load(open("./res/synonym.json", 'r'))
+            self.dictionary = json.load(open(path, 'r'))
         except Exception as e:
-
-
-        except Exception as e:
-            logging.warn("Miss synonym.json")
-            self.dictionary = {}
+            logging.warn("Miss synonym.json")
+            self.dictionary = {}
 
         if not redis:
             logging.warning(
{python → rag}/nlp/term_weight.py
RENAMED
@@ -1,9 +1,11 @@
+# -*- coding: utf-8 -*-
 import math
 import json
 import re
 import os
 import numpy as np
-from nlp import huqie
+from rag.nlp import huqie
+from web_server.utils.file_utils import get_project_base_directory
 
 
 class Dealer:
@@ -60,16 +62,14 @@ class Dealer:
             return set(res.keys())
         return res
 
-        fnm = os.path.join(
-        if not os.path.exists(fnm):
-            fnm = os.path.join(os.path.dirname(__file__), '../../res/')
+        fnm = os.path.join(get_project_base_directory(), "res")
         self.ne, self.df = {}, {}
         try:
-            self.ne = json.load(open(fnm
+            self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
         except Exception as e:
             print("[WARNING] Load ner.json FAIL!")
         try:
-            self.df = load_dict(fnm
+            self.df = load_dict(os.path.join(fnm, "term.freq"))
         except Exception as e:
             print("[WARNING] Load term.freq FAIL!")
 
{python → rag}/parser/__init__.py
RENAMED
File without changes
{python → rag}/parser/docx_parser.py
RENAMED
@@ -1,8 +1,9 @@
+# -*- coding: utf-8 -*-
 from docx import Document
 import re
 import pandas as pd
 from collections import Counter
-from nlp import huqie
+from rag.nlp import huqie
 from io import BytesIO
 
 
{python → rag}/parser/excel_parser.py
RENAMED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from openpyxl import load_workbook
 import sys
 from io import BytesIO
@@ -12,11 +13,18 @@ class HuExcelParser:
         res = []
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
-
-
-
-
-
+            rows = list(ws.rows)
+            ti = list(rows[0])
+            for r in list(rows[1:]):
+                l = []
+                for i,c in enumerate(r):
+                    if not c.value:continue
+                    t = str(ti[i].value) if i < len(ti) else ""
+                    t += (":" if t else "") + str(c.value)
+                    l.append(t)
+                l = "; ".join(l)
+                if sheetname.lower().find("sheet") <0: l += " ——"+sheetname
+                res.append(l)
         return res
 
 
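The new loop flattens every data row into a single "header:value; ..." string, using row 0 as the header and appending the sheet name when it is not a generic "SheetN". A worked example of that flattening (values are illustrative):

    header = ["品名", "价格"]  # row 0 of a sheet named "价格表"
    row = ["苹果", 3.5]        # one data row
    cells = [f"{h}:{v}" for h, v in zip(header, row) if v]
    chunk = "; ".join(cells) + " ——价格表"  # sheet name kept: "价格表" contains no "sheet"
    print(chunk)  # 品名:苹果; 价格:3.5 ——价格表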
{python → rag}/parser/pdf_parser.py
RENAMED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import xgboost as xgb
 from io import BytesIO
 import torch
@@ -6,11 +7,11 @@ import pdfplumber
 import logging
 from PIL import Image
 import numpy as np
-from nlp import huqie
+from rag.nlp import huqie
 from collections import Counter
 from copy import deepcopy
-from cv.table_recognize import TableTransformer
-from cv.ppdetection import PPDet
+from rag.cv.table_recognize import TableTransformer
+from rag.cv.ppdetection import PPDet
 from huggingface_hub import hf_hub_download
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
{python → rag}/res/huqie.txt
RENAMED
File without changes

{python → rag}/res/ner.json
RENAMED
File without changes