KevinHuSh committed
Commit 3079197 · 1 Parent(s): db8cae3

build python version rag-flow (#21)


* clean rust version project

* build python version rag-flow

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .env.template +0 -9
  2. Cargo.toml +0 -42
  3. {python/conf → conf}/mapping.json +0 -0
  4. conf/private.pem +30 -0
  5. conf/public.pem +9 -0
  6. conf/service_conf.yaml +28 -0
  7. docker/.env +0 -21
  8. docker/docker-compose.yml +30 -17
  9. docker/init.sql +2 -0
  10. migration/Cargo.toml +0 -20
  11. migration/README.md +0 -41
  12. migration/src/lib.rs +0 -12
  13. migration/src/m20220101_000001_create_table.rs +0 -440
  14. migration/src/main.rs +0 -6
  15. python/Dockerfile +29 -0
  16. python/README.md +0 -22
  17. python/{nlp/__init__.py → ToPDF.pdf} +0 -0
  18. python/] +63 -0
  19. python/conf/logging.json +0 -41
  20. python/conf/sys.cnf +0 -9
  21. python/llm/__init__.py +0 -21
  22. python/output/ToPDF.pdf +0 -0
  23. python/requirements.txt +0 -194
  24. python/res/1-0.tm +8 -0
  25. python/res/thumbnail-1-0.tm +3 -0
  26. python/svr/add_thumbnail2file.py +0 -118
  27. python/svr/dialog_svr.py +0 -165
  28. python/svr/parse_user_docs.py +0 -258
  29. python/tmp.log +15 -0
  30. python/util/config.py +0 -31
  31. python/util/db_conn.py +0 -70
  32. python/util/setup_logging.py +0 -36
  33. rag/__init__.py +0 -0
  34. rag/llm/__init__.py +32 -0
  35. {python → rag}/llm/chat_model.py +15 -0
  36. {python → rag}/llm/cv_model.py +28 -5
  37. {python → rag}/llm/embedding_model.py +44 -11
  38. rag/nlp/__init__.py +0 -0
  39. {python → rag}/nlp/huchunk.py +0 -0
  40. {python → rag}/nlp/huqie.py +3 -8
  41. {python → rag}/nlp/query.py +3 -3
  42. {python → rag}/nlp/search.py +3 -5
  43. {python → rag}/nlp/synonym.py +7 -10
  44. {python → rag}/nlp/term_weight.py +6 -6
  45. {python → rag}/parser/__init__.py +0 -0
  46. {python → rag}/parser/docx_parser.py +2 -1
  47. {python → rag}/parser/excel_parser.py +13 -5
  48. {python → rag}/parser/pdf_parser.py +4 -3
  49. {python → rag}/res/huqie.txt +0 -0
  50. {python → rag}/res/ner.json +0 -0
.env.template DELETED
@@ -1,9 +0,0 @@
1
- # Database
2
- HOST=127.0.0.1
3
- PORT=8000
4
- DATABASE_URL="postgresql://infiniflow:infiniflow@localhost/docgpt"
5
-
6
- # S3 Storage
7
- MINIO_HOST="127.0.0.1:9000"
8
- MINIO_USR="infiniflow"
9
- MINIO_PWD="infiniflow_docgpt"
Cargo.toml DELETED
@@ -1,42 +0,0 @@
1
- [package]
2
- name = "doc_gpt"
3
- version = "0.1.0"
4
- edition = "2021"
5
-
6
- # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7
-
8
- [dependencies]
9
- actix-web = "4.3.1"
10
- actix-rt = "2.8.0"
11
- actix-files = "0.6.2"
12
- actix-multipart = "0.4"
13
- actix-session = { version = "0.5" }
14
- actix-identity = { version = "0.4" }
15
- actix-web-httpauth = { version = "0.6" }
16
- actix-ws = "0.2.5"
17
- uuid = { version = "1.6.1", features = [
18
- "v4",
19
- "fast-rng",
20
- "macro-diagnostics",
21
- ] }
22
- thiserror = "1.0"
23
- postgres = "0.19.7"
24
- sea-orm = { version = "0.12.9", features = ["sqlx-postgres", "runtime-tokio-native-tls", "macros"] }
25
- serde = { version = "1", features = ["derive"] }
26
- serde_json = "1.0"
27
- tracing-subscriber = "0.3.18"
28
- dotenvy = "0.15.7"
29
- listenfd = "1.0.1"
30
- chrono = "0.4.31"
31
- migration = { path = "./migration" }
32
- minio = "0.1.0"
33
- futures-util = "0.3.29"
34
- actix-multipart-extract = "0.1.5"
35
- regex = "1.10.2"
36
- tokio = { version = "1.35.1", features = ["rt", "time", "macros"] }
37
-
38
- [[bin]]
39
- name = "doc_gpt"
40
-
41
- [workspace]
42
- members = [".", "migration"]
{python/conf → conf}/mapping.json RENAMED
File without changes
conf/private.pem ADDED
@@ -0,0 +1,30 @@
1
+ -----BEGIN RSA PRIVATE KEY-----
2
+ Proc-Type: 4,ENCRYPTED
3
+ DEK-Info: DES-EDE3-CBC,EFF8327C41E531AD
4
+
5
+ 7jdPFDAA6fiTzOIU7XGzKuT324JKZEcK5vBRJqBkA5XO6ENN1wLdhh3zQbl1Ejfv
6
+ KMSUIgbtQEJB4bvOzS//okbZa1vCNYuTS/NGcpKUnhqdOmAL3hl/kOtOLLjTZrwo
7
+ 3KX8iujLH7wQ64GxArtpUuaFq1k0whN1BB5RGJp3IO/L6pMpSWVRKO+JPUrD1Ujr
8
+ XA/LUKQJaZtXVUVOYPtIwbyqPsh93QBetJnRwwV3gNOwGpcX2jDpyTxDUkLJCPPg
9
+ 6Hw0pwlQEd8A11sjxCBbASwLeJO1L0w69QiX9chyOkZ+sfDsVpPt/wf1NexA7Cdj
10
+ 9uifJ4JGbby39QD6mInZGtnRzQRdafjuXlBR2I0Qa7fBRu8QsfhmLbWZfWno7j08
11
+ 4bAAoqB1vRNfSu8LVJXdEEh/HKuwu11pgRr5eH8WQ3hJg+Y2k7zDHpp1VaHL7/Kn
12
+ S+aN5bhQ4Xt0Ujdi1+rsmNchnF6LWsDezHWJeWUM6X7dJnqIBl8oCyghbghT8Tyw
13
+ aEKWXc2+7FsP5yd0NfG3PFYOLdLgfI43pHTAv5PEQ47w9r1XOwfblKKBUDEzaput
14
+ T3t5wQ6wxdyhRxeO4arCHfe/i+j3fzvhlwgbuwrmrkWGWSS86eMTaoGM8+uUrHv0
15
+ 6TbU0tj6DKKUslVk1dCHh9TnmNsXZuLJkceZF38PSKNxhzudU8OTtzhS0tFL91HX
16
+ vo7N+XdiGMs8oOSpjE6RPlhFhVAKGJpXwBj/vXLLcmzesA7ZB2kYtFKMIdsUQpls
17
+ PE/4K5PEX2d8pxA5zxo0HleA1YjW8i5WEcDQThZQzj2sWvg06zSjenVFrbCm9Bro
18
+ hFpAB/3zJHxdRN2MpNpvK35WITy1aDUdX1WdyrlcRtIE5ssFTSoxSj9ibbDZ78+z
19
+ gtbw/MUi6vU6Yz1EjvoYu/bmZAHt9Aagcxw6k58fjO2cEB9njK7xbbiZUSwpJhEe
20
+ U/PxK+SdOU/MmGKeqdgqSfhJkq0vhacvsEjFGRAfivSCHkL0UjhObU+rSJ3g1RMO
21
+ oukAev6TOAwbTKVWjg3/EX+pl/zorAgaPNYFX64TSH4lE3VjeWApITb9Z5C/sVxR
22
+ xW6hU9qyjzWYWY+91y16nkw1l7VQvWHUZwV7QzTScC2BOzDVpeqY1KiYJxgoo6sX
23
+ ZCqR5oh4vToG4W8ZrRyauwUaZJ3r+zhAgm+6n6TJQNwFEl0muji+1nPl32EiFsRs
24
+ qR6CtuhUOVQM4VnILDwFJfuGYRFtKzQgvseLNU4ZqAVqQj8l4ARGAP2P1Au/uUKy
25
+ oGzI7a+b5MvRHuvkxPAclOgXgX/8yyOLaBg+mgaqv9h2JIJD28PzouFl3BajRaVB
26
+ 7GWTnROJYhX5SuX/g585SLRKoQUtK0WhdJCjTRfyRJPwfdppgdTbWO99R4G+ir02
27
+ JQdSkZf2vmZRXenPNTEPDOUY6nVN6sUuBjmtOwoUF194ODgpYB6IaHqK08sa1pUh
28
+ 1mZyxitHdPbygePTe20XWMZFoK2knAqN0JPPbbNjCqiVV+7oqQAnkDIutspu9t2m
29
+ ny3jefFmNozbblQMghLUrq+x9wOEgvS76Sqvq3DG/2BkLzJF3MNkvw==
30
+ -----END RSA PRIVATE KEY-----
conf/public.pem ADDED
@@ -0,0 +1,9 @@
1
+ -----BEGIN PUBLIC KEY-----
2
+ MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArq9XTUSeYr2+N1h3Afl/
3
+ z8Dse/2yD0ZGrKwx+EEEcdsBLca9Ynmx3nIB5obmLlSfmskLpBo0UACBmB5rEjBp
4
+ 2Q2f3AG3Hjd4B+gNCG6BDaawuDlgANIhGnaTLrIqWrrcm4EMzJOnAOI1fgzJRsOO
5
+ UEfaS318Eq9OVO3apEyCCt0lOQK6PuksduOjVxtltDav+guVAA068NrPYmRNabVK
6
+ RNLJpL8w4D44sfth5RvZ3q9t+6RTArpEtc5sh5ChzvqPOzKGMXW83C95TxmXqpbK
7
+ 6olN4RevSfVjEAgCydH6HN6OhtOQEcnrU97r9H0iZOWwbw3pVrZiUkuRD1R56Wzs
8
+ 2wIDAQAB
9
+ -----END PUBLIC KEY-----
conf/service_conf.yaml ADDED
@@ -0,0 +1,28 @@
1
+ authentication:
2
+ client:
3
+ switch: false
4
+ http_app_key:
5
+ http_secret_key:
6
+ site:
7
+ switch: false
8
+ permission:
9
+ switch: false
10
+ component: false
11
+ dataset: false
12
+ ragflow:
13
+ # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported
14
+ host: 127.0.0.1
15
+ http_port: 9380
16
+ database:
17
+ name: 'rag_flow'
18
+ user: 'root'
19
+ passwd: 'infini_rag_flow'
20
+ host: '123.60.95.134'
21
+ port: 5455
22
+ max_connections: 100
23
+ stale_timeout: 30
24
+ oauth:
25
+ github:
26
+ client_id: 302129228f0d96055bee
27
+ secret_key: e518e55ccfcdfcae8996afc40f110e9c95f14fc4
28
+ url: https://github.com/login/oauth/access_token
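
The new conf/service_conf.yaml replaces the old .env/sys.cnf style configuration. A minimal sketch of how it could be read from Python, assuming a plain PyYAML loader; the actual loader code is not part of this diff:

```python
import yaml  # PyYAML is assumed to be available

# Hypothetical consumer of conf/service_conf.yaml added above.
with open("conf/service_conf.yaml") as f:
    conf = yaml.safe_load(f)

ragflow = conf["ragflow"]
db = conf["database"]
print(f"serving on {ragflow['host']}:{ragflow['http_port']}")
print(f"MySQL: {db['user']}@{db['host']}:{db['port']}/{db['name']}")
```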
docker/.env DELETED
@@ -1,21 +0,0 @@
1
- # Version of Elastic products
2
- STACK_VERSION=8.11.3
3
-
4
- # Set the cluster name
5
- CLUSTER_NAME=docgpt
6
-
7
- # Port to expose Elasticsearch HTTP API to the host
8
- ES_PORT=9200
9
-
10
- # Port to expose Kibana to the host
11
- KIBANA_PORT=6601
12
-
13
- # Increase or decrease based on the available host memory (in bytes)
14
- MEM_LIMIT=4073741824
15
-
16
- POSTGRES_USER=root
17
- POSTGRES_PASSWORD=infiniflow_docgpt
18
- POSTGRES_DB=docgpt
19
-
20
- MINIO_USER=infiniflow
21
- MINIO_PASSWORD=infiniflow_docgpt
docker/docker-compose.yml CHANGED
@@ -1,7 +1,7 @@
1
  version: '2.2'
2
  services:
3
  es01:
4
- container_name: docgpt-es-01
5
  image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
  volumes:
7
  - esdata01:/usr/share/elasticsearch/data
@@ -20,14 +20,14 @@ services:
20
  soft: -1
21
  hard: -1
22
  networks:
23
- - docgpt
24
  restart: always
25
 
26
  kibana:
27
  depends_on:
28
  - es01
29
  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
30
- container_name: docgpt-kibana
31
  volumes:
32
  - kibanadata:/usr/share/kibana/data
33
  ports:
@@ -37,26 +37,39 @@ services:
37
  - ELASTICSEARCH_HOSTS=http://es01:9200
38
  mem_limit: ${MEM_LIMIT}
39
  networks:
40
- - docgpt
41
 
42
- postgres:
43
- image: postgres
44
- container_name: docgpt-postgres
45
  environment:
46
- - POSTGRES_USER=${POSTGRES_USER}
47
- - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
48
- - POSTGRES_DB=${POSTGRES_DB}
49
  ports:
50
- - 5455:5432
51
  volumes:
52
- - pg_data:/var/lib/postgresql/data
 
53
  networks:
54
- - docgpt
55
  restart: always
56
 
 
57
  minio:
58
  image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
59
- container_name: docgpt-minio
60
  command: server --console-address ":9001" /data
61
  ports:
62
  - 9000:9000
@@ -67,7 +80,7 @@ services:
67
  volumes:
68
  - minio_data:/data
69
  networks:
70
- - docgpt
71
  restart: always
72
 
73
 
@@ -76,11 +89,11 @@ volumes:
76
  driver: local
77
  kibanadata:
78
  driver: local
79
- pg_data:
80
  driver: local
81
  minio_data:
82
  driver: local
83
 
84
  networks:
85
- docgpt:
86
  driver: bridge
 
1
  version: '2.2'
2
  services:
3
  es01:
4
+ container_name: ragflow-es-01
5
  image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
  volumes:
7
  - esdata01:/usr/share/elasticsearch/data
 
20
  soft: -1
21
  hard: -1
22
  networks:
23
+ - ragflow
24
  restart: always
25
 
26
  kibana:
27
  depends_on:
28
  - es01
29
  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
30
+ container_name: ragflow-kibana
31
  volumes:
32
  - kibanadata:/usr/share/kibana/data
33
  ports:
 
37
  - ELASTICSEARCH_HOSTS=http://es01:9200
38
  mem_limit: ${MEM_LIMIT}
39
  networks:
40
+ - ragflow
41
 
42
+ mysql:
43
+ image: mysql:5.7.18
44
+ container_name: ragflow-mysql
45
  environment:
46
+ - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
47
+ - TZ="Asia/Shanghai"
48
+ command:
49
+ --max_connections=1000
50
+ --character-set-server=utf8mb4
51
+ --collation-server=utf8mb4_general_ci
52
+ --default-authentication-plugin=mysql_native_password
53
+ --tls_version="TLSv1.2,TLSv1.3"
54
+ --init-file /data/application/init.sql
55
  ports:
56
+ - ${MYSQL_PORT}:3306
57
  volumes:
58
+ - mysql_data:/var/lib/mysql
59
+ - ./init.sql:/data/application/init.sql
60
  networks:
61
+ - ragflow
62
+ healthcheck:
63
+ test: [ "CMD-SHELL", "curl --silent localhost:3306 >/dev/null || exit 1" ]
64
+ interval: 10s
65
+ timeout: 10s
66
+ retries: 3
67
  restart: always
68
 
69
+
70
  minio:
71
  image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
72
+ container_name: ragflow-minio
73
  command: server --console-address ":9001" /data
74
  ports:
75
  - 9000:9000
 
80
  volumes:
81
  - minio_data:/data
82
  networks:
83
+ - ragflow
84
  restart: always
85
 
86
 
 
89
  driver: local
90
  kibanadata:
91
  driver: local
92
+ mysql_data:
93
  driver: local
94
  minio_data:
95
  driver: local
96
 
97
  networks:
98
+ ragflow:
99
  driver: bridge
docker/init.sql ADDED
@@ -0,0 +1,2 @@
1
+ CREATE DATABASE IF NOT EXISTS rag_flow;
2
+ USE rag_flow;
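
docker/init.sql is mounted into the new MySQL service via `--init-file /data/application/init.sql` and the `./init.sql:/data/application/init.sql` volume in docker-compose.yml, so `rag_flow` exists on first start. A hedged connectivity check from the compose host, assuming the published port and password match the values in service_conf.yaml and that pymysql is installed (it is not pinned anywhere in this diff):

```python
import pymysql  # assumed driver; not listed in this commit's requirements

# Hypothetical smoke test against the ragflow-mysql container.
conn = pymysql.connect(
    host="127.0.0.1",            # assumes you connect from the host running docker compose
    port=5455,                   # ${MYSQL_PORT} -> 3306; 5455 matches database.port in service_conf.yaml
    user="root",
    password="infini_rag_flow",  # assumes ${MYSQL_PASSWORD} equals database.passwd in service_conf.yaml
    database="rag_flow",         # created by init.sql
)
with conn.cursor() as cur:
    cur.execute("SELECT DATABASE()")
    print(cur.fetchone())        # -> ('rag_flow',)
conn.close()
```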
migration/Cargo.toml DELETED
@@ -1,20 +0,0 @@
1
- [package]
2
- name = "migration"
3
- version = "0.1.0"
4
- edition = "2021"
5
- publish = false
6
-
7
- [lib]
8
- name = "migration"
9
- path = "src/lib.rs"
10
-
11
- [dependencies]
12
- async-std = { version = "1", features = ["attributes", "tokio1"] }
13
- chrono = "0.4.31"
14
-
15
- [dependencies.sea-orm-migration]
16
- version = "0.12.0"
17
- features = [
18
- "runtime-tokio-rustls", # `ASYNC_RUNTIME` feature
19
- "sqlx-postgres", # `DATABASE_DRIVER` feature
20
- ]
migration/README.md DELETED
@@ -1,41 +0,0 @@
1
- # Running Migrator CLI
2
-
3
- - Generate a new migration file
4
- ```sh
5
- cargo run -- generate MIGRATION_NAME
6
- ```
7
- - Apply all pending migrations
8
- ```sh
9
- cargo run
10
- ```
11
- ```sh
12
- cargo run -- up
13
- ```
14
- - Apply first 10 pending migrations
15
- ```sh
16
- cargo run -- up -n 10
17
- ```
18
- - Rollback last applied migrations
19
- ```sh
20
- cargo run -- down
21
- ```
22
- - Rollback last 10 applied migrations
23
- ```sh
24
- cargo run -- down -n 10
25
- ```
26
- - Drop all tables from the database, then reapply all migrations
27
- ```sh
28
- cargo run -- fresh
29
- ```
30
- - Rollback all applied migrations, then reapply all migrations
31
- ```sh
32
- cargo run -- refresh
33
- ```
34
- - Rollback all applied migrations
35
- ```sh
36
- cargo run -- reset
37
- ```
38
- - Check the status of all migrations
39
- ```sh
40
- cargo run -- status
41
- ```
migration/src/lib.rs DELETED
@@ -1,12 +0,0 @@
1
- pub use sea_orm_migration::prelude::*;
2
-
3
- mod m20220101_000001_create_table;
4
-
5
- pub struct Migrator;
6
-
7
- #[async_trait::async_trait]
8
- impl MigratorTrait for Migrator {
9
- fn migrations() -> Vec<Box<dyn MigrationTrait>> {
10
- vec![Box::new(m20220101_000001_create_table::Migration)]
11
- }
12
- }
migration/src/m20220101_000001_create_table.rs DELETED
@@ -1,440 +0,0 @@
1
- use sea_orm_migration::prelude::*;
2
- use chrono::{ FixedOffset, Utc };
3
-
4
- #[allow(dead_code)]
5
- fn now() -> chrono::DateTime<FixedOffset> {
6
- Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap())
7
- }
8
- #[derive(DeriveMigrationName)]
9
- pub struct Migration;
10
-
11
- #[async_trait::async_trait]
12
- impl MigrationTrait for Migration {
13
- async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
14
- manager.create_table(
15
- Table::create()
16
- .table(UserInfo::Table)
17
- .if_not_exists()
18
- .col(
19
- ColumnDef::new(UserInfo::Uid)
20
- .big_integer()
21
- .not_null()
22
- .auto_increment()
23
- .primary_key()
24
- )
25
- .col(ColumnDef::new(UserInfo::Email).string().not_null())
26
- .col(ColumnDef::new(UserInfo::Nickname).string().not_null())
27
- .col(ColumnDef::new(UserInfo::AvatarBase64).string())
28
- .col(ColumnDef::new(UserInfo::ColorScheme).string().default("dark"))
29
- .col(ColumnDef::new(UserInfo::ListStyle).string().default("list"))
30
- .col(ColumnDef::new(UserInfo::Language).string().default("chinese"))
31
- .col(ColumnDef::new(UserInfo::Password).string().not_null())
32
- .col(
33
- ColumnDef::new(UserInfo::LastLoginAt)
34
- .timestamp_with_time_zone()
35
- .default(Expr::current_timestamp())
36
- )
37
- .col(
38
- ColumnDef::new(UserInfo::CreatedAt)
39
- .timestamp_with_time_zone()
40
- .default(Expr::current_timestamp())
41
- .not_null()
42
- )
43
- .col(
44
- ColumnDef::new(UserInfo::UpdatedAt)
45
- .timestamp_with_time_zone()
46
- .default(Expr::current_timestamp())
47
- .not_null()
48
- )
49
- .col(ColumnDef::new(UserInfo::IsDeleted).boolean().default(false))
50
- .to_owned()
51
- ).await?;
52
-
53
- manager.create_table(
54
- Table::create()
55
- .table(TagInfo::Table)
56
- .if_not_exists()
57
- .col(
58
- ColumnDef::new(TagInfo::Tid)
59
- .big_integer()
60
- .not_null()
61
- .auto_increment()
62
- .primary_key()
63
- )
64
- .col(ColumnDef::new(TagInfo::Uid).big_integer().not_null())
65
- .col(ColumnDef::new(TagInfo::TagName).string().not_null())
66
- .col(ColumnDef::new(TagInfo::Regx).string())
67
- .col(ColumnDef::new(TagInfo::Color).tiny_unsigned().default(1))
68
- .col(ColumnDef::new(TagInfo::Icon).tiny_unsigned().default(1))
69
- .col(ColumnDef::new(TagInfo::FolderId).big_integer())
70
- .col(
71
- ColumnDef::new(TagInfo::CreatedAt)
72
- .timestamp_with_time_zone()
73
- .default(Expr::current_timestamp())
74
- .not_null()
75
- )
76
- .col(
77
- ColumnDef::new(TagInfo::UpdatedAt)
78
- .timestamp_with_time_zone()
79
- .default(Expr::current_timestamp())
80
- .not_null()
81
- )
82
- .col(ColumnDef::new(TagInfo::IsDeleted).boolean().default(false))
83
- .to_owned()
84
- ).await?;
85
-
86
- manager.create_table(
87
- Table::create()
88
- .table(Tag2Doc::Table)
89
- .if_not_exists()
90
- .col(
91
- ColumnDef::new(Tag2Doc::Id)
92
- .big_integer()
93
- .not_null()
94
- .auto_increment()
95
- .primary_key()
96
- )
97
- .col(ColumnDef::new(Tag2Doc::TagId).big_integer())
98
- .col(ColumnDef::new(Tag2Doc::Did).big_integer())
99
- .to_owned()
100
- ).await?;
101
-
102
- manager.create_table(
103
- Table::create()
104
- .table(Kb2Doc::Table)
105
- .if_not_exists()
106
- .col(
107
- ColumnDef::new(Kb2Doc::Id)
108
- .big_integer()
109
- .not_null()
110
- .auto_increment()
111
- .primary_key()
112
- )
113
- .col(ColumnDef::new(Kb2Doc::KbId).big_integer())
114
- .col(ColumnDef::new(Kb2Doc::Did).big_integer())
115
- .col(ColumnDef::new(Kb2Doc::KbProgress).float().default(0))
116
- .col(ColumnDef::new(Kb2Doc::KbProgressMsg).string().default(""))
117
- .col(
118
- ColumnDef::new(Kb2Doc::UpdatedAt)
119
- .timestamp_with_time_zone()
120
- .default(Expr::current_timestamp())
121
- .not_null()
122
- )
123
- .col(ColumnDef::new(Kb2Doc::IsDeleted).boolean().default(false))
124
- .to_owned()
125
- ).await?;
126
-
127
- manager.create_table(
128
- Table::create()
129
- .table(Dialog2Kb::Table)
130
- .if_not_exists()
131
- .col(
132
- ColumnDef::new(Dialog2Kb::Id)
133
- .big_integer()
134
- .not_null()
135
- .auto_increment()
136
- .primary_key()
137
- )
138
- .col(ColumnDef::new(Dialog2Kb::DialogId).big_integer())
139
- .col(ColumnDef::new(Dialog2Kb::KbId).big_integer())
140
- .to_owned()
141
- ).await?;
142
-
143
- manager.create_table(
144
- Table::create()
145
- .table(Doc2Doc::Table)
146
- .if_not_exists()
147
- .col(
148
- ColumnDef::new(Doc2Doc::Id)
149
- .big_integer()
150
- .not_null()
151
- .auto_increment()
152
- .primary_key()
153
- )
154
- .col(ColumnDef::new(Doc2Doc::ParentId).big_integer())
155
- .col(ColumnDef::new(Doc2Doc::Did).big_integer())
156
- .to_owned()
157
- ).await?;
158
-
159
- manager.create_table(
160
- Table::create()
161
- .table(KbInfo::Table)
162
- .if_not_exists()
163
- .col(
164
- ColumnDef::new(KbInfo::KbId)
165
- .big_integer()
166
- .auto_increment()
167
- .not_null()
168
- .primary_key()
169
- )
170
- .col(ColumnDef::new(KbInfo::Uid).big_integer().not_null())
171
- .col(ColumnDef::new(KbInfo::KbName).string().not_null())
172
- .col(ColumnDef::new(KbInfo::Icon).tiny_unsigned().default(1))
173
- .col(
174
- ColumnDef::new(KbInfo::CreatedAt)
175
- .timestamp_with_time_zone()
176
- .default(Expr::current_timestamp())
177
- .not_null()
178
- )
179
- .col(
180
- ColumnDef::new(KbInfo::UpdatedAt)
181
- .timestamp_with_time_zone()
182
- .default(Expr::current_timestamp())
183
- .not_null()
184
- )
185
- .col(ColumnDef::new(KbInfo::IsDeleted).boolean().default(false))
186
- .to_owned()
187
- ).await?;
188
-
189
- manager.create_table(
190
- Table::create()
191
- .table(DocInfo::Table)
192
- .if_not_exists()
193
- .col(
194
- ColumnDef::new(DocInfo::Did)
195
- .big_integer()
196
- .not_null()
197
- .auto_increment()
198
- .primary_key()
199
- )
200
- .col(ColumnDef::new(DocInfo::Uid).big_integer().not_null())
201
- .col(ColumnDef::new(DocInfo::DocName).string().not_null())
202
- .col(ColumnDef::new(DocInfo::Location).string().not_null())
203
- .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
204
- .col(ColumnDef::new(DocInfo::Type).string().not_null())
205
- .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
206
- .comment("doc type|folder")
207
- .col(
208
- ColumnDef::new(DocInfo::CreatedAt)
209
- .timestamp_with_time_zone()
210
- .default(Expr::current_timestamp())
211
- .not_null()
212
- )
213
- .col(
214
- ColumnDef::new(DocInfo::UpdatedAt)
215
- .timestamp_with_time_zone()
216
- .default(Expr::current_timestamp())
217
- .not_null()
218
- )
219
- .col(ColumnDef::new(DocInfo::IsDeleted).boolean().default(false))
220
- .to_owned()
221
- ).await?;
222
-
223
- manager.create_table(
224
- Table::create()
225
- .table(DialogInfo::Table)
226
- .if_not_exists()
227
- .col(
228
- ColumnDef::new(DialogInfo::DialogId)
229
- .big_integer()
230
- .not_null()
231
- .auto_increment()
232
- .primary_key()
233
- )
234
- .col(ColumnDef::new(DialogInfo::Uid).big_integer().not_null())
235
- .col(ColumnDef::new(DialogInfo::KbId).big_integer().not_null())
236
- .col(ColumnDef::new(DialogInfo::DialogName).string().not_null())
237
- .col(ColumnDef::new(DialogInfo::History).string().comment("json"))
238
- .col(
239
- ColumnDef::new(DialogInfo::CreatedAt)
240
- .timestamp_with_time_zone()
241
- .default(Expr::current_timestamp())
242
- .not_null()
243
- )
244
- .col(
245
- ColumnDef::new(DialogInfo::UpdatedAt)
246
- .timestamp_with_time_zone()
247
- .default(Expr::current_timestamp())
248
- .not_null()
249
- )
250
- .col(ColumnDef::new(DialogInfo::IsDeleted).boolean().default(false))
251
- .to_owned()
252
- ).await?;
253
-
254
- let root_insert = Query::insert()
255
- .into_table(UserInfo::Table)
256
- .columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password])
257
- .values_panic(["[email protected]".into(), "root".into(), "123456".into()])
258
- .to_owned();
259
-
260
- let doc_insert = Query::insert()
261
- .into_table(DocInfo::Table)
262
- .columns([
263
- DocInfo::Uid,
264
- DocInfo::DocName,
265
- DocInfo::Size,
266
- DocInfo::Type,
267
- DocInfo::Location,
268
- ])
269
- .values_panic([(1).into(), "/".into(), (0).into(), "folder".into(), "".into()])
270
- .to_owned();
271
-
272
- let tag_insert = Query::insert()
273
- .into_table(TagInfo::Table)
274
- .columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon])
275
- .values_panic([
276
- (1).into(),
277
- "Video".into(),
278
- ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
279
- (1).into(),
280
- (1).into(),
281
- ])
282
- .values_panic([
283
- (1).into(),
284
- "Picture".into(),
285
- ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
286
- (2).into(),
287
- (2).into(),
288
- ])
289
- .values_panic([
290
- (1).into(),
291
- "Music".into(),
292
- ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
293
- (3).into(),
294
- (3).into(),
295
- ])
296
- .values_panic([
297
- (1).into(),
298
- "Document".into(),
299
- ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
300
- (3).into(),
301
- (3).into(),
302
- ])
303
- .to_owned();
304
-
305
- manager.exec_stmt(root_insert).await?;
306
- manager.exec_stmt(doc_insert).await?;
307
- manager.exec_stmt(tag_insert).await?;
308
- Ok(())
309
- }
310
-
311
- async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
312
- manager.drop_table(Table::drop().table(UserInfo::Table).to_owned()).await?;
313
-
314
- manager.drop_table(Table::drop().table(TagInfo::Table).to_owned()).await?;
315
-
316
- manager.drop_table(Table::drop().table(Tag2Doc::Table).to_owned()).await?;
317
-
318
- manager.drop_table(Table::drop().table(Kb2Doc::Table).to_owned()).await?;
319
-
320
- manager.drop_table(Table::drop().table(Dialog2Kb::Table).to_owned()).await?;
321
-
322
- manager.drop_table(Table::drop().table(Doc2Doc::Table).to_owned()).await?;
323
-
324
- manager.drop_table(Table::drop().table(KbInfo::Table).to_owned()).await?;
325
-
326
- manager.drop_table(Table::drop().table(DocInfo::Table).to_owned()).await?;
327
-
328
- manager.drop_table(Table::drop().table(DialogInfo::Table).to_owned()).await?;
329
-
330
- Ok(())
331
- }
332
- }
333
-
334
- #[derive(DeriveIden)]
335
- enum UserInfo {
336
- Table,
337
- Uid,
338
- Email,
339
- Nickname,
340
- AvatarBase64,
341
- ColorScheme,
342
- ListStyle,
343
- Language,
344
- Password,
345
- LastLoginAt,
346
- CreatedAt,
347
- UpdatedAt,
348
- IsDeleted,
349
- }
350
-
351
- #[derive(DeriveIden)]
352
- enum TagInfo {
353
- Table,
354
- Tid,
355
- Uid,
356
- TagName,
357
- Regx,
358
- Color,
359
- Icon,
360
- FolderId,
361
- CreatedAt,
362
- UpdatedAt,
363
- IsDeleted,
364
- }
365
-
366
- #[derive(DeriveIden)]
367
- enum Tag2Doc {
368
- Table,
369
- Id,
370
- TagId,
371
- Did,
372
- }
373
-
374
- #[derive(DeriveIden)]
375
- enum Kb2Doc {
376
- Table,
377
- Id,
378
- KbId,
379
- Did,
380
- KbProgress,
381
- KbProgressMsg,
382
- UpdatedAt,
383
- IsDeleted,
384
- }
385
-
386
- #[derive(DeriveIden)]
387
- enum Dialog2Kb {
388
- Table,
389
- Id,
390
- DialogId,
391
- KbId,
392
- }
393
-
394
- #[derive(DeriveIden)]
395
- enum Doc2Doc {
396
- Table,
397
- Id,
398
- ParentId,
399
- Did,
400
- }
401
-
402
- #[derive(DeriveIden)]
403
- enum KbInfo {
404
- Table,
405
- KbId,
406
- Uid,
407
- KbName,
408
- Icon,
409
- CreatedAt,
410
- UpdatedAt,
411
- IsDeleted,
412
- }
413
-
414
- #[derive(DeriveIden)]
415
- enum DocInfo {
416
- Table,
417
- Did,
418
- Uid,
419
- DocName,
420
- Location,
421
- Size,
422
- Type,
423
- ThumbnailBase64,
424
- CreatedAt,
425
- UpdatedAt,
426
- IsDeleted,
427
- }
428
-
429
- #[derive(DeriveIden)]
430
- enum DialogInfo {
431
- Table,
432
- Uid,
433
- KbId,
434
- DialogId,
435
- DialogName,
436
- History,
437
- CreatedAt,
438
- UpdatedAt,
439
- IsDeleted,
440
- }
migration/src/main.rs DELETED
@@ -1,6 +0,0 @@
1
- use sea_orm_migration::prelude::*;
2
-
3
- #[async_std::main]
4
- async fn main() {
5
- cli::run_cli(migration::Migrator).await;
6
- }
python/Dockerfile ADDED
@@ -0,0 +1,29 @@
1
+ FROM ubuntu:22.04 as base
2
+
3
+ RUN apt-get update
4
+
5
+ ENV TZ="Asia/Taipei"
6
+ RUN apt-get install -yq \
7
+ build-essential \
8
+ curl \
9
+ libncursesw5-dev \
10
+ libssl-dev \
11
+ libsqlite3-dev \
12
+ libgdbm-dev \
13
+ libc6-dev \
14
+ libbz2-dev \
15
+ software-properties-common \
16
+ python3.11 python3.11-dev python3-pip
17
+
18
+ RUN apt-get install -yq git
19
+ RUN pip3 config set global.index-url https://mirror.baidu.com/pypi/simple
20
+ RUN pip3 config set global.trusted-host mirror.baidu.com
21
+ RUN pip3 install --upgrade pip
22
+ RUN pip3 install torch==2.0.1
23
+ RUN pip3 install torch-model-archiver==0.8.2
24
+ RUN pip3 install torchvision==0.15.2
25
+ COPY requirements.txt .
26
+
27
+ WORKDIR /docgpt
28
+ ENV PYTHONPATH=/docgpt/
29
+
python/README.md DELETED
@@ -1,22 +0,0 @@
1
-
2
- ```shell
3
-
4
- docker pull postgres
5
-
6
- LOCAL_POSTGRES_DATA=./postgres-data
7
-
8
- docker run
9
- --name docass-postgres
10
- -p 5455:5432
11
- -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data
12
- -e POSTGRES_USER=root
13
- -e POSTGRES_PASSWORD=infiniflow_docass
14
- -e POSTGRES_DB=docass
15
- -d
16
- postgres
17
-
18
- docker network create elastic
19
- docker pull elasticsearch:8.11.3;
20
- docker pull docker.elastic.co/kibana/kibana:8.11.3
21
-
22
- ```
python/{nlp/__init__.py → ToPDF.pdf} RENAMED
File without changes
python/] ADDED
@@ -0,0 +1,63 @@
1
+ from abc import ABC
2
+ from openai import OpenAI
3
+ import os
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ class Base(ABC):
8
+ def describe(self, image, max_tokens=300):
9
+ raise NotImplementedError("Please implement encode method!")
10
+
11
+
12
+ class GptV4(Base):
13
+ def __init__(self):
14
+ import openapi
15
+ openapi.api_key = os.environ["OPENAPI_KEY"]
16
+ self.client = OpenAI()
17
+
18
+ def describe(self, image, max_tokens=300):
19
+ buffered = BytesIO()
20
+ try:
21
+ image.save(buffered, format="JPEG")
22
+ except Exception as e:
23
+ image.save(buffered, format="PNG")
24
+ b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
25
+
26
+ res = self.client.chat.completions.create(
27
+ model="gpt-4-vision-preview",
28
+ messages=[
29
+ {
30
+ "role": "user",
31
+ "content": [
32
+ {
33
+ "type": "text",
34
+ "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
35
+ },
36
+ {
37
+ "type": "image_url",
38
+ "image_url": {
39
+ "url": f"data:image/jpeg;base64,{b64}"
40
+ },
41
+ },
42
+ ],
43
+ }
44
+ ],
45
+ max_tokens=max_tokens,
46
+ )
47
+ return res.choices[0].message.content.strip()
48
+
49
+
50
+ class QWen(Base):
51
+ def chat(self, system, history, gen_conf):
52
+ from http import HTTPStatus
53
+ from dashscope import Generation
54
+ from dashscope.api_entities.dashscope_response import Role
55
+ # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
56
+ response = Generation.call(
57
+ Generation.Models.qwen_turbo,
58
+ messages=messages,
59
+ result_format='message'
60
+ )
61
+ if response.status_code == HTTPStatus.OK:
62
+ return response.output.choices[0]['message']['content']
63
+ return response.message
python/conf/logging.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "version":1,
3
- "disable_existing_loggers":false,
4
- "formatters":{
5
- "simple":{
6
- "format":"%(asctime)s - %(name)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s"
7
- }
8
- },
9
- "handlers":{
10
- "console":{
11
- "class":"logging.StreamHandler",
12
- "level":"DEBUG",
13
- "formatter":"simple",
14
- "stream":"ext://sys.stdout"
15
- },
16
- "info_file_handler":{
17
- "class":"logging.handlers.TimedRotatingFileHandler",
18
- "level":"INFO",
19
- "formatter":"simple",
20
- "filename":"log/info.log",
21
- "when": "MIDNIGHT",
22
- "interval":1,
23
- "backupCount":30,
24
- "encoding":"utf8"
25
- },
26
- "error_file_handler":{
27
- "class":"logging.handlers.TimedRotatingFileHandler",
28
- "level":"ERROR",
29
- "formatter":"simple",
30
- "filename":"log/errors.log",
31
- "when": "MIDNIGHT",
32
- "interval":1,
33
- "backupCount":30,
34
- "encoding":"utf8"
35
- }
36
- },
37
- "root":{
38
- "level":"DEBUG",
39
- "handlers":["console","info_file_handler","error_file_handler"]
40
- }
41
- }
python/conf/sys.cnf DELETED
@@ -1,9 +0,0 @@
1
- [infiniflow]
2
- es=http://es01:9200
3
- postgres_user=root
4
- postgres_password=infiniflow_docgpt
5
- postgres_host=postgres
6
- postgres_port=5432
7
- minio_host=minio:9000
8
- minio_user=infiniflow
9
- minio_password=infiniflow_docgpt
python/llm/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- from .embedding_model import *
3
- from .chat_model import *
4
- from .cv_model import *
5
-
6
- EmbeddingModel = None
7
- ChatModel = None
8
- CvModel = None
9
-
10
-
11
- if os.environ.get("OPENAI_API_KEY"):
12
- EmbeddingModel = GptEmbed()
13
- ChatModel = GptTurbo()
14
- CvModel = GptV4()
15
-
16
- elif os.environ.get("DASHSCOPE_API_KEY"):
17
- EmbeddingModel = QWenEmbd()
18
- ChatModel = QWenChat()
19
- CvModel = QWenCV()
20
- else:
21
- EmbeddingModel = HuEmbedding()
python/output/ToPDF.pdf ADDED
File without changes
python/requirements.txt DELETED
@@ -1,194 +0,0 @@
1
- accelerate==0.24.1
2
- addict==2.4.0
3
- aiobotocore==2.7.0
4
- aiofiles==23.2.1
5
- aiohttp==3.8.6
6
- aioitertools==0.11.0
7
- aiosignal==1.3.1
8
- aliyun-python-sdk-core==2.14.0
9
- aliyun-python-sdk-kms==2.16.2
10
- altair==5.1.2
11
- anyio==3.7.1
12
- astor==0.8.1
13
- async-timeout==4.0.3
14
- attrdict==2.0.1
15
- attrs==23.1.0
16
- Babel==2.13.1
17
- bce-python-sdk==0.8.92
18
- beautifulsoup4==4.12.2
19
- bitsandbytes==0.41.1
20
- blinker==1.7.0
21
- botocore==1.31.64
22
- cachetools==5.3.2
23
- certifi==2023.7.22
24
- cffi==1.16.0
25
- charset-normalizer==3.3.2
26
- click==8.1.7
27
- cloudpickle==3.0.0
28
- contourpy==1.2.0
29
- crcmod==1.7
30
- cryptography==41.0.5
31
- cssselect==1.2.0
32
- cssutils==2.9.0
33
- cycler==0.12.1
34
- Cython==3.0.5
35
- datasets==2.13.0
36
- datrie==0.8.2
37
- decorator==5.1.1
38
- defusedxml==0.7.1
39
- dill==0.3.6
40
- einops==0.7.0
41
- elastic-transport==8.10.0
42
- elasticsearch==8.10.1
43
- elasticsearch-dsl==8.9.0
44
- et-xmlfile==1.1.0
45
- fastapi==0.104.1
46
- ffmpy==0.3.1
47
- filelock==3.13.1
48
- fire==0.5.0
49
- FlagEmbedding==1.1.5
50
- Flask==3.0.0
51
- flask-babel==4.0.0
52
- fonttools==4.44.0
53
- frozenlist==1.4.0
54
- fsspec==2023.10.0
55
- future==0.18.3
56
- gast==0.5.4
57
- -e
58
- git+https://github.com/ggerganov/llama.cpp.git@5f6e0c0dff1e7a89331e6b25eca9a9fd71324069#egg=gguf&subdirectory=gguf-py
59
- gradio==3.50.2
60
- gradio_client==0.6.1
61
- greenlet==3.0.1
62
- h11==0.14.0
63
- hanziconv==0.3.2
64
- httpcore==1.0.1
65
- httpx==0.25.1
66
- huggingface-hub==0.17.3
67
- idna==3.4
68
- imageio==2.31.6
69
- imgaug==0.4.0
70
- importlib-metadata==6.8.0
71
- importlib-resources==6.1.0
72
- install==1.3.5
73
- itsdangerous==2.1.2
74
- Jinja2==3.1.2
75
- jmespath==0.10.0
76
- joblib==1.3.2
77
- jsonschema==4.19.2
78
- jsonschema-specifications==2023.7.1
79
- kiwisolver==1.4.5
80
- lazy_loader==0.3
81
- lmdb==1.4.1
82
- lxml==4.9.3
83
- MarkupSafe==2.1.3
84
- matplotlib==3.8.1
85
- modelscope==1.9.4
86
- mpmath==1.3.0
87
- multidict==6.0.4
88
- multiprocess==0.70.14
89
- networkx==3.2.1
90
- nltk==3.8.1
91
- numpy==1.24.4
92
- nvidia-cublas-cu12==12.1.3.1
93
- nvidia-cuda-cupti-cu12==12.1.105
94
- nvidia-cuda-nvrtc-cu12==12.1.105
95
- nvidia-cuda-runtime-cu12==12.1.105
96
- nvidia-cudnn-cu12==8.9.2.26
97
- nvidia-cufft-cu12==11.0.2.54
98
- nvidia-curand-cu12==10.3.2.106
99
- nvidia-cusolver-cu12==11.4.5.107
100
- nvidia-cusparse-cu12==12.1.0.106
101
- nvidia-nccl-cu12==2.18.1
102
- nvidia-nvjitlink-cu12==12.3.52
103
- nvidia-nvtx-cu12==12.1.105
104
- opencv-contrib-python==4.6.0.66
105
- opencv-python==4.6.0.66
106
- openpyxl==3.1.2
107
- opt-einsum==3.3.0
108
- orjson==3.9.10
109
- oss2==2.18.3
110
- packaging==23.2
111
- paddleocr==2.7.0.3
112
- paddlepaddle-gpu==2.5.2.post120
113
- pandas==2.1.2
114
- pdf2docx==0.5.5
115
- pdfminer.six==20221105
116
- pdfplumber==0.10.3
117
- Pillow==10.0.1
118
- platformdirs==3.11.0
119
- premailer==3.10.0
120
- protobuf==4.25.0
121
- psutil==5.9.6
122
- pyarrow==14.0.0
123
- pyclipper==1.3.0.post5
124
- pycocotools==2.0.7
125
- pycparser==2.21
126
- pycryptodome==3.19.0
127
- pydantic==1.10.13
128
- pydub==0.25.1
129
- PyMuPDF==1.20.2
130
- pyparsing==3.1.1
131
- pypdfium2==4.23.1
132
- python-dateutil==2.8.2
133
- python-docx==1.1.0
134
- python-multipart==0.0.6
135
- pytz==2023.3.post1
136
- PyYAML==6.0.1
137
- rapidfuzz==3.5.2
138
- rarfile==4.1
139
- referencing==0.30.2
140
- regex==2023.10.3
141
- requests==2.31.0
142
- rpds-py==0.12.0
143
- s3fs==2023.10.0
144
- safetensors==0.4.0
145
- scikit-image==0.22.0
146
- scikit-learn==1.3.2
147
- scipy==1.11.3
148
- semantic-version==2.10.0
149
- sentence-transformers==2.2.2
150
- sentencepiece==0.1.98
151
- shapely==2.0.2
152
- simplejson==3.19.2
153
- six==1.16.0
154
- sniffio==1.3.0
155
- sortedcontainers==2.4.0
156
- soupsieve==2.5
157
- SQLAlchemy==2.0.23
158
- starlette==0.27.0
159
- sympy==1.12
160
- tabulate==0.9.0
161
- tblib==3.0.0
162
- termcolor==2.3.0
163
- threadpoolctl==3.2.0
164
- tifffile==2023.9.26
165
- tiktoken==0.5.1
166
- timm==0.9.10
167
- tokenizers==0.13.3
168
- tomli==2.0.1
169
- toolz==0.12.0
170
- torch==2.1.0
171
- torchaudio==2.1.0
172
- torchvision==0.16.0
173
- tornado==6.3.3
174
- tqdm==4.66.1
175
- transformers==4.33.0
176
- transformers-stream-generator==0.0.4
177
- triton==2.1.0
178
- typing_extensions==4.8.0
179
- tzdata==2023.3
180
- urllib3==2.0.7
181
- uvicorn==0.24.0
182
- uvloop==0.19.0
183
- visualdl==2.5.3
184
- websockets==11.0.3
185
- Werkzeug==3.0.1
186
- wrapt==1.15.0
187
- xgboost==2.0.1
188
- xinference==0.6.0
189
- xorbits==0.7.0
190
- xoscar==0.1.3
191
- xxhash==3.4.1
192
- yapf==0.40.2
193
- yarl==1.9.2
194
- zipp==3.17.0
python/res/1-0.tm ADDED
@@ -0,0 +1,8 @@
1
+ 2023-12-20 11:44:08.791336+00:00
2
+ 2023-12-20 11:44:08.853249+00:00
3
+ 2023-12-20 11:44:08.909933+00:00
4
+ 2023-12-21 00:47:09.996757+00:00
5
+ 2023-12-20 11:44:08.965855+00:00
6
+ 2023-12-20 11:44:09.011682+00:00
7
+ 2023-12-21 00:47:10.063326+00:00
8
+ 2023-12-20 11:44:09.069486+00:00
python/res/thumbnail-1-0.tm ADDED
@@ -0,0 +1,3 @@
1
+ 2023-12-27 08:21:49.309802+00:00
2
+ 2023-12-27 08:37:22.407772+00:00
3
+ 2023-12-27 08:59:18.845627+00:00
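
The `.tm` files added above are high-water-mark timestamp logs: the `python/svr/*.py` workers deleted below append one timestamp per processed row and read the latest one back through `util.findMaxDt` to resume polling. A rough sketch of what that helper presumably does; the real `python/util/__init__.py` is not shown in this diff:

```python
import os

def find_max_dt(fnm, default="1970-01-01 00:00:00"):
    """Return the latest timestamp recorded in a res/*.tm file (assumed behavior)."""
    mx = default
    if not os.path.exists(fnm):
        return mx
    with open(fnm) as f:
        for line in f:
            line = line.strip()
            # ISO-8601 UTC strings compare chronologically as plain strings
            if line and line > mx:
                mx = line
    return mx
```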
python/svr/add_thumbnail2file.py DELETED
@@ -1,118 +0,0 @@
1
- import sys, datetime, random, re, cv2
2
- from os.path import dirname, realpath
3
- sys.path.append(dirname(realpath(__file__)) + "/../")
4
- from util.db_conn import Postgres
5
- from util.minio_conn import HuMinio
6
- from util import findMaxDt
7
- import base64
8
- from io import BytesIO
9
- import pandas as pd
10
- from PIL import Image
11
- import pdfplumber
12
-
13
-
14
- PG = Postgres("infiniflow", "docgpt")
15
- MINIO = HuMinio("infiniflow")
16
- def set_thumbnail(did, base64):
17
- sql = f"""
18
- update doc_info set thumbnail_base64='{base64}'
19
- where
20
- did={did}
21
- """
22
- PG.update(sql)
23
-
24
-
25
- def collect(comm, mod, tm):
26
- sql = f"""
27
- select
28
- did, uid, doc_name, location, updated_at
29
- from doc_info
30
- where
31
- updated_at >= '{tm}'
32
- and MOD(did, {comm}) = {mod}
33
- and is_deleted=false
34
- and type <> 'folder'
35
- and thumbnail_base64=''
36
- order by updated_at asc
37
- limit 10
38
- """
39
- docs = PG.select(sql)
40
- if len(docs) == 0:return pd.DataFrame()
41
-
42
- mtm = str(docs["updated_at"].max())[:19]
43
- print("TOTAL:", len(docs), "To: ", mtm)
44
- return docs
45
-
46
-
47
- def build(row):
48
- if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
49
- row["doc_name"].lower().strip()):
50
- set_thumbnail(row["did"], "_")
51
- return
52
-
53
- def thumbnail(img, SIZE=128):
54
- w,h = img.size
55
- p = SIZE/max(w, h)
56
- w, h = int(w*p), int(h*p)
57
- img.thumbnail((w, h))
58
- buffered = BytesIO()
59
- try:
60
- img.save(buffered, format="JPEG")
61
- except Exception as e:
62
- try:
63
- img.save(buffered, format="PNG")
64
- except Exception as ee:
65
- pass
66
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
67
-
68
-
69
- iobytes = BytesIO(MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
70
- if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
71
- pdf = pdfplumber.open(iobytes)
72
- img = pdf.pages[0].to_image().annotated
73
- set_thumbnail(row["did"], thumbnail(img))
74
-
75
- if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
76
- img = Image.open(iobytes)
77
- set_thumbnail(row["did"], thumbnail(img))
78
-
79
- if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
80
- url = MINIO.get_presigned_url("%s-upload"%str(row["uid"]),
81
- row["location"],
82
- expires=datetime.timedelta(seconds=60)
83
- )
84
- cap = cv2.VideoCapture(url)
85
- succ = cap.isOpened()
86
- i = random.randint(1, 11)
87
- while succ:
88
- ret, frame = cap.read()
89
- if not ret: break
90
- if i > 0:
91
- i -= 1
92
- continue
93
- img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
94
- print(img.size)
95
- set_thumbnail(row["did"], thumbnail(img))
96
- cap.release()
97
- cv2.destroyAllWindows()
98
-
99
-
100
- def main(comm, mod):
101
- global model
102
- tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
103
- tm = findMaxDt(tm_fnm)
104
- rows = collect(comm, mod, tm)
105
- if len(rows) == 0:return
106
-
107
- tmf = open(tm_fnm, "a+")
108
- for _, r in rows.iterrows():
109
- build(r)
110
- tmf.write(str(r["updated_at"]) + "\n")
111
- tmf.close()
112
-
113
-
114
- if __name__ == "__main__":
115
- from mpi4py import MPI
116
- comm = MPI.COMM_WORLD
117
- main(comm.Get_size(), comm.Get_rank())
118
-
python/svr/dialog_svr.py DELETED
@@ -1,165 +0,0 @@
1
- #-*- coding:utf-8 -*-
2
- import sys, os, re,inspect,json,traceback,logging,argparse, copy
3
- sys.path.append(os.path.realpath(os.path.dirname(inspect.getfile(inspect.currentframe())))+"/../")
4
- from tornado.web import RequestHandler,Application
5
- from tornado.ioloop import IOLoop
6
- from tornado.httpserver import HTTPServer
7
- from tornado.options import define,options
8
- from util import es_conn, setup_logging
9
- from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
10
- from nlp import huqie
11
- from nlp import query as Query
12
- from nlp import search
13
- from llm import HuEmbedding, GptTurbo
14
- import numpy as np
15
- from io import BytesIO
16
- from util import config
17
- from timeit import default_timer as timer
18
- from collections import OrderedDict
19
- from llm import ChatModel, EmbeddingModel
20
-
21
- SE = None
22
- CFIELD="content_ltks"
23
- EMBEDDING = EmbeddingModel
24
- LLM = ChatModel
25
-
26
- def get_QA_pairs(hists):
27
- pa = []
28
- for h in hists:
29
- for k in ["user", "assistant"]:
30
- if h.get(k):
31
- pa.append({
32
- "content": h[k],
33
- "role": k,
34
- })
35
-
36
- for p in pa[:-1]: assert len(p) == 2, p
37
- return pa
38
-
39
-
40
-
41
- def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
42
- max_len //= len(top_i)
43
- # add instruction to prompt
44
- instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
45
- if len(instructions)>2:
46
- # Said that LLM is sensitive to the first and the last one, so
47
- # rearrange the order of references
48
- instructions.append(copy.deepcopy(instructions[1]))
49
- instructions.pop(1)
50
-
51
- def token_num(txt):
52
- c = 0
53
- for tk in re.split(r"[,。/?‘’”“:;:;!!]", txt):
54
- if re.match(r"[a-zA-Z-]+$", tk):
55
- c += 1
56
- continue
57
- c += len(tk)
58
- return c
59
-
60
- _inst = ""
61
- for ins in instructions:
62
- if token_num(_inst) > 4096:
63
- _inst += "\n知识库:" + instructions[-1][:max_len]
64
- break
65
- _inst += "\n知识库:" + ins[:max_len]
66
- return _inst
67
-
68
-
69
- def prompt_and_answer(history, inst):
70
- hist = get_QA_pairs(history)
71
- chks = []
72
- for s in re.split(r"[::;;。\n\r]+", inst):
73
- if s: chks.append(s)
74
- chks = len(set(chks))/(0.1+len(chks))
75
- print("Duplication portion:", chks)
76
-
77
- system = """
78
- 你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答%s。当所有知识库内容都与问题无关时,你的回答必须包括"知识库中未找到您要的答案!这是我所知道的,仅作参考。"这句话。回答需要考虑聊天历史。
79
- 以下是知识库:
80
- %s
81
- 以上是知识库。
82
- """%((",最好总结成表格" if chks<0.6 and chks>0 else ""), inst)
83
-
84
- print("【PROMPT】:", system)
85
- start = timer()
86
- response = LLM.chat(system, hist, {"temperature": 0.2, "max_tokens": 512})
87
- print("GENERATE: ", timer()-start)
88
- print("===>>", response)
89
- return response
90
-
91
-
92
- class Handler(RequestHandler):
93
- def post(self):
94
- global SE,MUST_TK_NUM
95
- param = json.loads(self.request.body.decode('utf-8'))
96
- try:
97
- question = param.get("history",[{"user": "Hi!"}])[-1]["user"]
98
- res = SE.search({
99
- "question": question,
100
- "kb_ids": param.get("kb_ids", []),
101
- "size": param.get("topn", 15)},
102
- search.index_name(param["uid"])
103
- )
104
-
105
- sim = SE.rerank(res, question)
106
- rk_idx = np.argsort(sim*-1)
107
- topidx = [i for i in rk_idx if sim[i] >= aram.get("similarity", 0.5)][:param.get("topn",12)]
108
- inst = get_instruction(res, topidx)
109
-
110
- ans, topidx = prompt_and_answer(param["history"], inst)
111
- ans = SE.insert_citations(ans, topidx, res)
112
-
113
- refer = OrderedDict()
114
- docnms = {}
115
- for i in rk_idx:
116
- did = res.field[res.ids[i]]["doc_id"]
117
- if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
118
- if did not in refer: refer[did] = []
119
- refer[did].append({
120
- "chunk_id": res.ids[i],
121
- "content": res.field[res.ids[i]]["content_ltks"],
122
- "image": ""
123
- })
124
-
125
- print("::::::::::::::", ans)
126
- self.write(json.dumps({
127
- "code":0,
128
- "msg":"success",
129
- "data":{
130
- "uid": param["uid"],
131
- "dialog_id": param["dialog_id"],
132
- "assistant": ans,
133
- "refer": [{
134
- "did": did,
135
- "doc_name": docnms[did],
136
- "chunks": chunks
137
- } for did, chunks in refer.items()]
138
- }
139
- }))
140
- logging.info("SUCCESS[%d]"%(res.total)+json.dumps(param, ensure_ascii=False))
141
-
142
- except Exception as e:
143
- logging.error("Request 500: "+str(e))
144
- self.write(json.dumps({
145
- "code":500,
146
- "msg":str(e),
147
- "data":{}
148
- }))
149
- print(traceback.format_exc())
150
-
151
-
152
- if __name__ == '__main__':
153
- parser = argparse.ArgumentParser()
154
- parser.add_argument("--port", default=4455, type=int, help="Port used for service")
155
- ARGS = parser.parse_args()
156
-
157
- SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
158
-
159
- app = Application([(r'/v1/chat/completions', Handler)],debug=False)
160
- http_server = HTTPServer(app)
161
- http_server.bind(ARGS.port)
162
- http_server.start(3)
163
-
164
- IOLoop.current().start()
165
-
python/svr/parse_user_docs.py DELETED
@@ -1,258 +0,0 @@
1
- import json, os, sys, hashlib, copy, time, random, re
2
- from os.path import dirname, realpath
3
- sys.path.append(dirname(realpath(__file__)) + "/../")
4
- from util.es_conn import HuEs
5
- from util.db_conn import Postgres
6
- from util.minio_conn import HuMinio
7
- from util import rmSpace, findMaxDt
8
- from FlagEmbedding import FlagModel
9
- from nlp import huchunk, huqie, search
10
- from io import BytesIO
11
- import pandas as pd
12
- from elasticsearch_dsl import Q
13
- from PIL import Image
14
- from parser import (
15
- PdfParser,
16
- DocxParser,
17
- ExcelParser
18
- )
19
- from nlp.huchunk import (
20
- PdfChunker,
21
- DocxChunker,
22
- ExcelChunker,
23
- PptChunker,
24
- TextChunker
25
- )
26
-
27
- ES = HuEs("infiniflow")
28
- BATCH_SIZE = 64
29
- PG = Postgres("infiniflow", "docgpt")
30
- MINIO = HuMinio("infiniflow")
31
-
32
- PDF = PdfChunker(PdfParser())
33
- DOC = DocxChunker(DocxParser())
34
- EXC = ExcelChunker(ExcelParser())
35
- PPT = PptChunker()
36
-
37
- def chuck_doc(name, binary):
38
- suff = os.path.split(name)[-1].lower().split(".")[-1]
39
- if suff.find("pdf") >= 0: return PDF(binary)
40
- if suff.find("doc") >= 0: return DOC(binary)
41
- if re.match(r"(xlsx|xlsm|xltx|xltm)", suff): return EXC(binary)
42
- if suff.find("ppt") >= 0: return PPT(binary)
43
- if os.envirement.get("PARSE_IMAGE") \
44
- and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
45
- name.lower()):
46
- from llm import CvModel
47
- txt = CvModel.describe(binary)
48
- field = TextChunker.Fields()
49
- field.text_chunks = [(txt, binary)]
50
- field.table_chunks = []
51
-
52
-
53
- return TextChunker()(binary)
54
-
55
-
56
- def collect(comm, mod, tm):
57
- sql = f"""
58
- select
59
- id as kb2doc_id,
60
- kb_id,
61
- did,
62
- updated_at,
63
- is_deleted
64
- from kb2_doc
65
- where
66
- updated_at >= '{tm}'
67
- and kb_progress = 0
68
- and MOD(did, {comm}) = {mod}
69
- order by updated_at asc
70
- limit 1000
71
- """
72
- kb2doc = PG.select(sql)
73
- if len(kb2doc) == 0:return pd.DataFrame()
74
-
75
- sql = """
76
- select
77
- did,
78
- uid,
79
- doc_name,
80
- location,
81
- size
82
- from doc_info
83
- where
84
- did in (%s)
85
- """%",".join([str(i) for i in kb2doc["did"].unique()])
86
- docs = PG.select(sql)
87
- docs = docs.fillna("")
88
- docs = docs.join(kb2doc.set_index("did"), on="did", how="left")
89
-
90
- mtm = str(docs["updated_at"].max())[:19]
91
- print("TOTAL:", len(docs), "To: ", mtm)
92
- return docs
93
-
94
-
95
- def set_progress(kb2doc_id, prog, msg="Processing..."):
96
- sql = f"""
97
- update kb2_doc set kb_progress={prog}, kb_progress_msg='{msg}'
98
- where
99
- id={kb2doc_id}
100
- """
101
- PG.update(sql)
102
-
103
-
104
- def build(row):
105
- if row["size"] > 256000000:
106
- set_progress(row["kb2doc_id"], -1, "File size exceeds( <= 256Mb )")
107
- return []
108
- res = ES.search(Q("term", doc_id=row["did"]))
109
- if ES.getTotal(res) > 0:
110
- ES.updateScriptByQuery(Q("term", doc_id=row["did"]),
111
- scripts="""
112
- if(!ctx._source.kb_id.contains('%s'))
113
- ctx._source.kb_id.add('%s');
114
- """%(str(row["kb_id"]), str(row["kb_id"])),
115
- idxnm = search.index_name(row["uid"])
116
- )
117
- set_progress(row["kb2doc_id"], 1, "Done")
118
- return []
119
-
120
- random.seed(time.time())
121
- set_progress(row["kb2doc_id"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
122
- try:
123
- obj = chuck_doc(row["doc_name"], MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
124
- except Exception as e:
125
- if re.search("(No such file|not found)", str(e)):
126
- set_progress(row["kb2doc_id"], -1, "Can not find file <%s>"%row["doc_name"])
127
- else:
128
- set_progress(row["kb2doc_id"], -1, f"Internal system error: %s"%str(e).replace("'", ""))
129
- return []
130
-
131
- if not obj.text_chunks and not obj.table_chunks:
132
- set_progress(row["kb2doc_id"], 1, "Nothing added! Mostly, file type unsupported yet.")
133
- return []
134
-
135
- set_progress(row["kb2doc_id"], random.randint(20, 60)/100., "Finished slicing files. Start to embedding the content.")
136
-
137
- doc = {
138
- "doc_id": row["did"],
139
- "kb_id": [str(row["kb_id"])],
140
- "docnm_kwd": os.path.split(row["location"])[-1],
141
- "title_tks": huqie.qie(os.path.split(row["location"])[-1]),
142
- "updated_at": str(row["updated_at"]).replace("T", " ")[:19]
143
- }
144
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
145
- output_buffer = BytesIO()
146
- docs = []
147
- md5 = hashlib.md5()
148
- for txt, img in obj.text_chunks:
149
- d = copy.deepcopy(doc)
150
- md5.update((txt + str(d["doc_id"])).encode("utf-8"))
151
- d["_id"] = md5.hexdigest()
152
- d["content_ltks"] = huqie.qie(txt)
153
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
154
- if not img:
155
- docs.append(d)
156
- continue
157
-
158
- if isinstance(img, Image): img.save(output_buffer, format='JPEG')
159
- else: output_buffer = BytesIO(img)
160
-
161
- MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
162
- output_buffer.getvalue())
163
- d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
164
- docs.append(d)
165
-
166
- for arr, img in obj.table_chunks:
167
- for i, txt in enumerate(arr):
168
- d = copy.deepcopy(doc)
169
- d["content_ltks"] = huqie.qie(txt)
170
- md5.update((txt + str(d["doc_id"])).encode("utf-8"))
171
- d["_id"] = md5.hexdigest()
172
- if not img:
173
- docs.append(d)
174
- continue
175
- img.save(output_buffer, format='JPEG')
176
- MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
177
- output_buffer.getvalue())
178
- d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
179
- docs.append(d)
180
- set_progress(row["kb2doc_id"], random.randint(60, 70)/100., "Continue embedding the content.")
181
-
182
- return docs
183
-
184
-
185
- def init_kb(row):
186
- idxnm = search.index_name(row["uid"])
187
- if ES.indexExist(idxnm): return
188
- return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
189
-
190
-
191
- model = None
192
- def embedding(docs):
193
- global model
194
- tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
195
- cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
196
- vects = 0.1 * tts + 0.9 * cnts
197
- assert len(vects) == len(docs)
198
- for i,d in enumerate(docs):d["q_vec"] = vects[i].tolist()
199
-
200
-
201
- def rm_doc_from_kb(df):
202
- if len(df) == 0:return
203
- for _,r in df.iterrows():
204
- ES.updateScriptByQuery(Q("term", doc_id=r["did"]),
205
- scripts="""
206
- if(ctx._source.kb_id.contains('%s'))
207
- ctx._source.kb_id.remove(
208
- ctx._source.kb_id.indexOf('%s')
209
- );
210
- """%(str(r["kb_id"]),str(r["kb_id"])),
211
- idxnm = search.index_name(r["uid"])
212
- )
213
- if len(df) == 0:return
214
- sql = """
215
- delete from kb2_doc where id in (%s)
216
- """%",".join([str(i) for i in df["kb2doc_id"]])
217
- PG.update(sql)
218
-
219
-
220
- def main(comm, mod):
221
- global model
222
- from llm import HuEmbedding
223
- model = HuEmbedding()
224
- tm_fnm = f"res/{comm}-{mod}.tm"
225
- tm = findMaxDt(tm_fnm)
226
- rows = collect(comm, mod, tm)
227
- if len(rows) == 0:return
228
-
229
- rm_doc_from_kb(rows.loc[rows.is_deleted == True])
230
- rows = rows.loc[rows.is_deleted == False].reset_index(drop=True)
231
- if len(rows) == 0:return
232
- tmf = open(tm_fnm, "a+")
233
- for _, r in rows.iterrows():
234
- cks = build(r)
235
- if not cks:
236
- tmf.write(str(r["updated_at"]) + "\n")
237
- continue
238
- ## TODO: exception handler
239
- ## set_progress(r["did"], -1, "ERROR: ")
240
- embedding(cks)
241
-
242
- set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
243
- "Finished embedding! Start to build index!")
244
- init_kb(r)
245
- es_r = ES.bulk(cks, search.index_name(r["uid"]))
246
- if es_r:
247
- set_progress(r["kb2doc_id"], -1, "Index failure!")
248
- print(es_r)
249
- else: set_progress(r["kb2doc_id"], 1., "Done!")
250
- tmf.write(str(r["updated_at"]) + "\n")
251
- tmf.close()
252
-
253
-
254
- if __name__ == "__main__":
255
- from mpi4py import MPI
256
- comm = MPI.COMM_WORLD
257
- main(comm.Get_size(), comm.Get_rank())
258
-
python/tmp.log ADDED
@@ -0,0 +1,15 @@
1
+
2
+ ----------- Model Configuration -----------
3
+ Model Arch: GFL
4
+ Transform Order:
5
+ --transform op: Resize
6
+ --transform op: NormalizeImage
7
+ --transform op: Permute
8
+ --transform op: PadStride
9
+ --------------------------------------------
10
+ Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
11
+ The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
12
+ Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
13
+ - This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
14
+ - This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
15
+ WARNING:root:The files are stored in /opt/home/kevinhu/docgpt/, please check it!
python/util/config.py DELETED
@@ -1,31 +0,0 @@
1
- from configparser import ConfigParser
2
- import os
3
- import inspect
4
-
5
- CF = ConfigParser()
6
- __fnm = os.path.join(os.path.dirname(__file__), '../conf/sys.cnf')
7
- if not os.path.exists(__fnm):
8
- __fnm = os.path.join(os.path.dirname(__file__), '../../conf/sys.cnf')
9
- assert os.path.exists(
10
- __fnm), f"【EXCEPTION】can't find {__fnm}." + os.path.dirname(__file__)
11
- if not os.path.exists(__fnm):
12
- __fnm = "./sys.cnf"
13
-
14
- CF.read(__fnm)
15
-
16
-
17
- class Config:
18
- def __init__(self, env):
19
- self.env = env
20
- if env == "spark":
21
- CF.read("./cv.cnf")
22
-
23
- def get(self, key, default=None):
24
- global CF
25
- return os.environ.get(key.upper(),
26
- CF[self.env].get(key, default)
27
- )
28
-
29
-
30
- def init(env):
31
- return Config(env)
python/util/db_conn.py DELETED
@@ -1,70 +0,0 @@
1
- import logging
2
- import time
3
- from util import config
4
- import pandas as pd
5
-
6
-
7
- class Postgres(object):
8
- def __init__(self, env, dbnm):
9
- self.config = config.init(env)
10
- self.conn = None
11
- self.dbnm = dbnm
12
- self.__open__()
13
-
14
- def __open__(self):
15
- import psycopg2
16
- try:
17
- if self.conn:
18
- self.__close__()
19
- del self.conn
20
- except Exception as e:
21
- pass
22
-
23
- try:
24
- self.conn = psycopg2.connect(f"""dbname={self.dbnm}
25
- user={self.config.get('postgres_user')}
26
- password={self.config.get('postgres_password')}
27
- host={self.config.get('postgres_host')}
28
- port={self.config.get('postgres_port')}""")
29
- except Exception as e:
30
- logging.error(
31
- "Fail to connect %s " %
32
- self.config.get("pgdb_host") + str(e))
33
-
34
- def __close__(self):
35
- try:
36
- self.conn.close()
37
- except Exception as e:
38
- logging.error(
39
- "Fail to close %s " %
40
- self.config.get("pgdb_host") + str(e))
41
-
42
- def select(self, sql):
43
- for _ in range(10):
44
- try:
45
- return pd.read_sql(sql, self.conn)
46
- except Exception as e:
47
- logging.error(f"Fail to exec {sql} " + str(e))
48
- self.__open__()
49
- time.sleep(1)
50
-
51
- return pd.DataFrame()
52
-
53
- def update(self, sql):
54
- for _ in range(10):
55
- try:
56
- cur = self.conn.cursor()
57
- cur.execute(sql)
58
- updated_rows = cur.rowcount
59
- self.conn.commit()
60
- cur.close()
61
- return updated_rows
62
- except Exception as e:
63
- logging.error(f"Fail to exec {sql} " + str(e))
64
- self.__open__()
65
- time.sleep(1)
66
- return 0
67
-
68
-
69
- if __name__ == "__main__":
70
- Postgres("infiniflow", "docgpt")
python/util/setup_logging.py DELETED
@@ -1,36 +0,0 @@
1
- import json
2
- import logging.config
3
- import os
4
-
5
-
6
- def log_dir():
7
- fnm = os.path.join(os.path.dirname(__file__), '../log/')
8
- if not os.path.exists(fnm):
9
- fnm = os.path.join(os.path.dirname(__file__), '../../log/')
10
- assert os.path.exists(fnm), f"Can't locate log dir: {fnm}"
11
- return fnm
12
-
13
-
14
- def setup_logging(default_path="conf/logging.json",
15
- default_level=logging.INFO,
16
- env_key="LOG_CFG"):
17
- path = default_path
18
- value = os.getenv(env_key, None)
19
- if value:
20
- path = value
21
- if os.path.exists(path):
22
- with open(path, "r") as f:
23
- config = json.load(f)
24
- fnm = log_dir()
25
-
26
- config["handlers"]["info_file_handler"]["filename"] = fnm + "info.log"
27
- config["handlers"]["error_file_handler"]["filename"] = fnm + "error.log"
28
- logging.config.dictConfig(config)
29
- else:
30
- logging.basicConfig(level=default_level)
31
-
32
-
33
- __fnm = os.path.join(os.path.dirname(__file__), 'conf/logging.json')
34
- if not os.path.exists(__fnm):
35
- __fnm = os.path.join(os.path.dirname(__file__), '../../conf/logging.json')
36
- setup_logging(__fnm)
rag/__init__.py ADDED
File without changes
rag/llm/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from .embedding_model import *
17
+ from .chat_model import *
18
+ from .cv_model import *
19
+
20
+
21
+ EmbeddingModel = {
22
+ "local": HuEmbedding,
23
+ "OpenAI": OpenAIEmbed,
24
+ "通义千问": QWenEmbed,
25
+ }
26
+
27
+
28
+ CvModel = {
29
+ "OpenAI": GptV4,
30
+ "通义千问": QWenCV,
31
+ }
32
+
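The new `rag/llm/__init__.py` turns model selection into a dictionary lookup keyed by provider name, so callers resolve a class from configuration instead of importing it directly. A hedged usage sketch (the API key is a placeholder; the `(key, model_name=...)` constructor shape follows the classes as changed in this commit):

    from rag.llm import EmbeddingModel, CvModel

    provider = "OpenAI"                        # EmbeddingModel also has "local" and "通义千问" entries
    embd = EmbeddingModel[provider]("sk-...")  # placeholder key
    cv = CvModel[provider]("sk-...")

    vectors, used_tokens = embd.encode(["hello world"])
    # description = cv.describe(image_bytes)   # image_bytes is hypothetical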
{python → rag}/llm/chat_model.py RENAMED
@@ -1,3 +1,18 @@
1
  from abc import ABC
2
  from openai import OpenAI
3
  import os
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
  from openai import OpenAI
18
  import os
{python → rag}/llm/cv_model.py RENAMED
@@ -1,3 +1,18 @@
1
  from abc import ABC
2
  from openai import OpenAI
3
  import os
@@ -6,6 +21,9 @@ from io import BytesIO
6
 
7
 
8
  class Base(ABC):
 
 
 
9
  def describe(self, image, max_tokens=300):
10
  raise NotImplementedError("Please implement encode method!")
11
 
@@ -40,14 +58,15 @@ class Base(ABC):
40
 
41
 
42
  class GptV4(Base):
43
- def __init__(self):
44
- self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
45
 
46
  def describe(self, image, max_tokens=300):
47
  b64 = self.image2base64(image)
48
 
49
  res = self.client.chat.completions.create(
50
- model="gpt-4-vision-preview",
51
  messages=self.prompt(b64),
52
  max_tokens=max_tokens,
53
  )
@@ -55,11 +74,15 @@ class GptV4(Base):
55
 
56
 
57
  class QWenCV(Base):
 
 
 
 
 
58
  def describe(self, image, max_tokens=300):
59
  from http import HTTPStatus
60
  from dashscope import MultiModalConversation
61
- # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
62
- response = MultiModalConversation.call(model=MultiModalConversation.Models.qwen_vl_chat_v1,
63
  messages=self.prompt(self.image2base64(image)))
64
  if response.status_code == HTTPStatus.OK:
65
  return response.output.choices[0]['message']['content']
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
  from openai import OpenAI
18
  import os
 
21
 
22
 
23
  class Base(ABC):
24
+ def __init__(self, key, model_name):
25
+ pass
26
+
27
  def describe(self, image, max_tokens=300):
28
  raise NotImplementedError("Please implement encode method!")
29
 
 
58
 
59
 
60
  class GptV4(Base):
61
+ def __init__(self, key, model_name="gpt-4-vision-preview"):
62
+ self.client = OpenAI(key)
63
+ self.model_name = model_name
64
 
65
  def describe(self, image, max_tokens=300):
66
  b64 = self.image2base64(image)
67
 
68
  res = self.client.chat.completions.create(
69
+ model=self.model_name,
70
  messages=self.prompt(b64),
71
  max_tokens=max_tokens,
72
  )
 
74
 
75
 
76
  class QWenCV(Base):
77
+ def __init__(self, key, model_name="qwen-vl-chat-v1"):
78
+ import dashscope
79
+ dashscope.api_key = key
80
+ self.model_name = model_name
81
+
82
  def describe(self, image, max_tokens=300):
83
  from http import HTTPStatus
84
  from dashscope import MultiModalConversation
85
+ response = MultiModalConversation.call(model=self.model_name,
 
86
  messages=self.prompt(self.image2base64(image)))
87
  if response.status_code == HTTPStatus.OK:
88
  return response.output.choices[0]['message']['content']
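Both vision wrappers now take an explicit key and model name at construction time instead of reading `OPENAI_API_KEY`/`DASHSCOPE_API_KEY` from the environment, while `describe(image, max_tokens)` keeps its signature. A small sketch under those assumptions (keys and the image file are placeholders; whether `image2base64` expects raw bytes or a PIL image is not visible in this hunk):

    from rag.llm.cv_model import GptV4, QWenCV

    with open("thumbnail.png", "rb") as f:   # hypothetical image file
        image = f.read()

    gpt = GptV4("sk-...")                    # defaults to gpt-4-vision-preview
    print(gpt.describe(image, max_tokens=300))

    qwen = QWenCV("dashscope-key")           # defaults to qwen-vl-chat-v1
    print(qwen.describe(image))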
{python → rag}/llm/embedding_model.py RENAMED
@@ -1,12 +1,35 @@
1
  from abc import ABC
 
 
2
  from openai import OpenAI
3
  from FlagEmbedding import FlagModel
4
  import torch
5
  import os
6
  import numpy as np
7
 
 
 
8
 
9
  class Base(ABC):
 
 
 
 
10
  def encode(self, texts: list, batch_size=32):
11
  raise NotImplementedError("Please implement encode method!")
12
 
@@ -28,34 +51,44 @@ class HuEmbedding(Base):
28
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
29
  use_fp16=torch.cuda.is_available())
30
 
 
31
  def encode(self, texts: list, batch_size=32):
 
 
32
  res = []
33
  for i in range(0, len(texts), batch_size):
34
  res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
35
- return np.array(res)
36
 
37
 
38
- class GptEmbed(Base):
39
- def __init__(self):
40
- self.client = OpenAI(api_key=os.envirement["OPENAI_API_KEY"])
 
41
 
42
  def encode(self, texts: list, batch_size=32):
 
 
43
  res = self.client.embeddings.create(input=texts,
44
- model="text-embedding-ada-002")
45
- return [d["embedding"] for d in res["data"]]
 
46
 
 
 
 
 
47
 
48
- class QWenEmbd(Base):
49
  def encode(self, texts: list, batch_size=32, text_type="document"):
50
- # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
51
  import dashscope
52
- from http import HTTPStatus
53
  res = []
 
54
  for txt in texts:
55
  resp = dashscope.TextEmbedding.call(
56
- model=dashscope.TextEmbedding.Models.text_embedding_v2,
57
  input=txt[:2048],
58
  text_type=text_type
59
  )
60
  res.append(resp["output"]["embeddings"][0]["embedding"])
61
- return res
 
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
+
18
+ import dashscope
19
  from openai import OpenAI
20
  from FlagEmbedding import FlagModel
21
  import torch
22
  import os
23
  import numpy as np
24
 
25
+ from rag.utils import num_tokens_from_string
26
+
27
 
28
  class Base(ABC):
29
+ def __init__(self, key, model_name):
30
+ pass
31
+
32
+
33
  def encode(self, texts: list, batch_size=32):
34
  raise NotImplementedError("Please implement encode method!")
35
 
 
51
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
52
  use_fp16=torch.cuda.is_available())
53
 
54
+
55
  def encode(self, texts: list, batch_size=32):
56
+ token_count = 0
57
+ for t in texts: token_count += num_tokens_from_string(t)
58
  res = []
59
  for i in range(0, len(texts), batch_size):
60
  res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
61
+ return np.array(res), token_count
62
 
63
 
64
+ class OpenAIEmbed(Base):
65
+ def __init__(self, key, model_name="text-embedding-ada-002"):
66
+ self.client = OpenAI(key)
67
+ self.model_name = model_name
68
 
69
  def encode(self, texts: list, batch_size=32):
70
+ token_count = 0
71
+ for t in texts: token_count += num_tokens_from_string(t)
72
  res = self.client.embeddings.create(input=texts,
73
+ model=self.model_name)
74
+ return [d["embedding"] for d in res["data"]], token_count
75
+
76
 
77
+ class QWenEmbed(Base):
78
+ def __init__(self, key, model_name="text_embedding_v2"):
79
+ dashscope.api_key = key
80
+ self.model_name = model_name
81
 
 
82
  def encode(self, texts: list, batch_size=32, text_type="document"):
 
83
  import dashscope
 
84
  res = []
85
+ token_count = 0
86
  for txt in texts:
87
  resp = dashscope.TextEmbedding.call(
88
+ model=self.model_name,
89
  input=txt[:2048],
90
  text_type=text_type
91
  )
92
  res.append(resp["output"]["embeddings"][0]["embedding"])
93
+ token_count += resp["usage"]["total_tokens"]
94
+ return res, token_count
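The contract change to watch in `embedding_model.py` is the return value: every `encode()` now yields a `(vectors, token_count)` pair so callers can meter usage per request, counted locally via `num_tokens_from_string` or taken from the provider's `usage` field for QWen. A minimal sketch of the local path (the constructor arguments of `HuEmbedding` are not visible in this hunk; the no-argument call mirrors the deleted `parse_user_docs.py` and may need adjusting):

    from rag.llm.embedding_model import HuEmbedding

    model = HuEmbedding()  # loads whatever FlagEmbedding model the class is configured with
    vectors, token_count = model.encode(["什么是RAG?", "retrieval-augmented generation"])
    print(vectors.shape, token_count)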
rag/nlp/__init__.py ADDED
File without changes
{python → rag}/nlp/huchunk.py RENAMED
File without changes
{python → rag}/nlp/huqie.py RENAMED
@@ -9,6 +9,8 @@ import string
9
  import sys
10
  from hanziconv import HanziConv
11
 
 
 
12
 
13
  class Huqie:
14
  def key_(self, line):
@@ -41,14 +43,7 @@ class Huqie:
41
  self.DEBUG = debug
42
  self.DENOMINATOR = 1000000
43
  self.trie_ = datrie.Trie(string.printable)
44
- self.DIR_ = ""
45
- if os.path.exists("../res/huqie.txt"):
46
- self.DIR_ = "../res/huqie"
47
- if os.path.exists("./res/huqie.txt"):
48
- self.DIR_ = "./res/huqie"
49
- if os.path.exists("./huqie.txt"):
50
- self.DIR_ = "./huqie"
51
- assert self.DIR_, f"【Can't find huqie】"
52
 
53
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
54
  try:
 
9
  import sys
10
  from hanziconv import HanziConv
11
 
12
+ from web_server.utils.file_utils import get_project_base_directory
13
+
14
 
15
  class Huqie:
16
  def key_(self, line):
 
43
  self.DEBUG = debug
44
  self.DENOMINATOR = 1000000
45
  self.trie_ = datrie.Trie(string.printable)
46
+ self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
 
 
 
 
 
 
 
47
 
48
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
49
  try:
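Several modules in this change drop their try-multiple-relative-paths probing in favour of resolving resources from the project root. A sketch of that pattern, assuming `get_project_base_directory()` returns the repository root as the new imports imply:

    import os
    from web_server.utils.file_utils import get_project_base_directory

    huqie_prefix = os.path.join(get_project_base_directory(), "rag/res", "huqie")
    synonym_path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
    print(huqie_prefix, synonym_path)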
{python → rag}/nlp/query.py RENAMED
@@ -1,12 +1,12 @@
 
 
1
  import json
2
  import re
3
- import sys
4
- import os
5
  import logging
6
  import copy
7
  import math
8
  from elasticsearch_dsl import Q, Search
9
- from nlp import huqie, term_weight, synonym
10
 
11
 
12
  class EsQueryer:
 
1
+ # -*- coding: utf-8 -*-
2
+
3
  import json
4
  import re
 
 
5
  import logging
6
  import copy
7
  import math
8
  from elasticsearch_dsl import Q, Search
9
+ from rag.nlp import huqie, term_weight, synonym
10
 
11
 
12
  class EsQueryer:
{python → rag}/nlp/search.py RENAMED
@@ -1,13 +1,11 @@
 
1
  import re
2
  from elasticsearch_dsl import Q, Search, A
3
  from typing import List, Optional, Tuple, Dict, Union
4
  from dataclasses import dataclass
5
- from util import setup_logging, rmSpace
6
- from nlp import huqie, query
7
- from datetime import datetime
8
- from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
9
  import numpy as np
10
- from copy import deepcopy
11
 
12
 
13
  def index_name(uid): return f"docgpt_{uid}"
 
1
+ # -*- coding: utf-8 -*-
2
  import re
3
  from elasticsearch_dsl import Q, Search, A
4
  from typing import List, Optional, Tuple, Dict, Union
5
  from dataclasses import dataclass
6
+ from rag.utils import rmSpace
7
+ from rag.nlp import huqie, query
 
 
8
  import numpy as np
 
9
 
10
 
11
  def index_name(uid): return f"docgpt_{uid}"
{python → rag}/nlp/synonym.py RENAMED
@@ -1,8 +1,11 @@
1
  import json
 
2
  import time
3
  import logging
4
  import re
5
 
 
 
6
 
7
  class Dealer:
8
  def __init__(self, redis=None):
@@ -10,18 +13,12 @@ class Dealer:
10
  self.lookup_num = 100000000
11
  self.load_tm = time.time() - 1000000
12
  self.dictionary = None
 
13
  try:
14
- self.dictionary = json.load(open("./synonym.json", 'r'))
15
- except Exception as e:
16
- pass
17
- try:
18
- self.dictionary = json.load(open("./res/synonym.json", 'r'))
19
  except Exception as e:
20
- try:
21
- self.dictionary = json.load(open("../res/synonym.json", 'r'))
22
- except Exception as e:
23
- logging.warn("Miss synonym.json")
24
- self.dictionary = {}
25
 
26
  if not redis:
27
  logging.warning(
 
1
  import json
2
+ import os
3
  import time
4
  import logging
5
  import re
6
 
7
+ from web_server.utils.file_utils import get_project_base_directory
8
+
9
 
10
  class Dealer:
11
  def __init__(self, redis=None):
 
13
  self.lookup_num = 100000000
14
  self.load_tm = time.time() - 1000000
15
  self.dictionary = None
16
+ path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
17
  try:
18
+ self.dictionary = json.load(open(path, 'r'))
 
 
 
 
19
  except Exception as e:
20
+ logging.warn("Miss synonym.json")
21
+ self.dictionary = {}
 
 
 
22
 
23
  if not redis:
24
  logging.warning(
{python → rag}/nlp/term_weight.py RENAMED
@@ -1,9 +1,11 @@
 
1
  import math
2
  import json
3
  import re
4
  import os
5
  import numpy as np
6
- from nlp import huqie
 
7
 
8
 
9
  class Dealer:
@@ -60,16 +62,14 @@ class Dealer:
60
  return set(res.keys())
61
  return res
62
 
63
- fnm = os.path.join(os.path.dirname(__file__), '../res/')
64
- if not os.path.exists(fnm):
65
- fnm = os.path.join(os.path.dirname(__file__), '../../res/')
66
  self.ne, self.df = {}, {}
67
  try:
68
- self.ne = json.load(open(fnm + "ner.json", "r"))
69
  except Exception as e:
70
  print("[WARNING] Load ner.json FAIL!")
71
  try:
72
- self.df = load_dict(fnm + "term.freq")
73
  except Exception as e:
74
  print("[WARNING] Load term.freq FAIL!")
75
 
 
1
+ # -*- coding: utf-8 -*-
2
  import math
3
  import json
4
  import re
5
  import os
6
  import numpy as np
7
+ from rag.nlp import huqie
8
+ from web_server.utils.file_utils import get_project_base_directory
9
 
10
 
11
  class Dealer:
 
62
  return set(res.keys())
63
  return res
64
 
65
+ fnm = os.path.join(get_project_base_directory(), "res")
 
 
66
  self.ne, self.df = {}, {}
67
  try:
68
+ self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
69
  except Exception as e:
70
  print("[WARNING] Load ner.json FAIL!")
71
  try:
72
+ self.df = load_dict(os.path.join(fnm, "term.freq"))
73
  except Exception as e:
74
  print("[WARNING] Load term.freq FAIL!")
75
 
{python → rag}/parser/__init__.py RENAMED
File without changes
{python → rag}/parser/docx_parser.py RENAMED
@@ -1,8 +1,9 @@
 
1
  from docx import Document
2
  import re
3
  import pandas as pd
4
  from collections import Counter
5
- from nlp import huqie
6
  from io import BytesIO
7
 
8
 
 
1
+ # -*- coding: utf-8 -*-
2
  from docx import Document
3
  import re
4
  import pandas as pd
5
  from collections import Counter
6
+ from rag.nlp import huqie
7
  from io import BytesIO
8
 
9
 
{python → rag}/parser/excel_parser.py RENAMED
@@ -1,3 +1,4 @@
 
1
  from openpyxl import load_workbook
2
  import sys
3
  from io import BytesIO
@@ -12,11 +13,18 @@ class HuExcelParser:
12
  res = []
13
  for sheetname in wb.sheetnames:
14
  ws = wb[sheetname]
15
- lines = []
16
- for r in ws.rows:
17
- lines.append(
18
- "\t".join([str(c.value) if c.value is not None else "" for c in r]))
19
- res.append(f"《{sheetname}》\n" + "\n".join(lines))
 
 
 
 
 
 
 
20
  return res
21
 
22
 
 
1
+ # -*- coding: utf-8 -*-
2
  from openpyxl import load_workbook
3
  import sys
4
  from io import BytesIO
 
13
  res = []
14
  for sheetname in wb.sheetnames:
15
  ws = wb[sheetname]
16
+ rows = list(ws.rows)
17
+ ti = list(rows[0])
18
+ for r in list(rows[1:]):
19
+ l = []
20
+ for i,c in enumerate(r):
21
+ if not c.value:continue
22
+ t = str(ti[i].value) if i < len(ti) else ""
23
+ t += (":" if t else "") + str(c.value)
24
+ l.append(t)
25
+ l = "; ".join(l)
26
+ if sheetname.lower().find("sheet") <0: l += " ——"+sheetname
27
+ res.append(l)
28
  return res
29
 
30
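The rewritten excel parser stops emitting whole sheets as tab-separated blocks; the first row is treated as a header and every later row becomes a single `header:value; header:value` line, suffixed with the sheet name whenever it is not a default `Sheet…` name, so each record is self-describing for the chunker. A standalone illustration of that flattening on a synthetic workbook (it reproduces the loop from the diff rather than calling the parser class, whose method name is not shown in this hunk):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.title = "Employees"
    ws.append(["Name", "Dept"])
    ws.append(["Alice", "R&D"])

    rows = list(ws.rows)
    ti = list(rows[0])                      # header row
    res = []
    for r in rows[1:]:
        fields = []
        for i, c in enumerate(r):
            if not c.value:
                continue
            t = str(ti[i].value) if i < len(ti) else ""
            t += (":" if t else "") + str(c.value)
            fields.append(t)
        line = "; ".join(fields)
        if ws.title.lower().find("sheet") < 0:
            line += " ——" + ws.title
        res.append(line)
    print(res)                              # ['Name:Alice; Dept:R&D ——Employees']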
 
{python → rag}/parser/pdf_parser.py RENAMED
@@ -1,3 +1,4 @@
 
1
  import xgboost as xgb
2
  from io import BytesIO
3
  import torch
@@ -6,11 +7,11 @@ import pdfplumber
6
  import logging
7
  from PIL import Image
8
  import numpy as np
9
- from nlp import huqie
10
  from collections import Counter
11
  from copy import deepcopy
12
- from cv.table_recognize import TableTransformer
13
- from cv.ppdetection import PPDet
14
  from huggingface_hub import hf_hub_download
15
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
16
 
 
1
+ # -*- coding: utf-8 -*-
2
  import xgboost as xgb
3
  from io import BytesIO
4
  import torch
 
7
  import logging
8
  from PIL import Image
9
  import numpy as np
10
+ from rag.nlp import huqie
11
  from collections import Counter
12
  from copy import deepcopy
13
+ from rag.cv.table_recognize import TableTransformer
14
+ from rag.cv.ppdetection import PPDet
15
  from huggingface_hub import hf_hub_download
16
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
17
 
{python → rag}/res/huqie.txt RENAMED
File without changes
{python → rag}/res/ner.json RENAMED
File without changes