KevinHuSh committed
Commit 3079197 · 1 Parent(s): db8cae3

build python version rag-flow (#21)


* clean rust version project

* build python version rag-flow

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .env.template +0 -9
  2. Cargo.toml +0 -42
  3. {python/conf → conf}/mapping.json +0 -0
  4. conf/private.pem +30 -0
  5. conf/public.pem +9 -0
  6. conf/service_conf.yaml +28 -0
  7. docker/.env +0 -21
  8. docker/docker-compose.yml +30 -17
  9. docker/init.sql +2 -0
  10. migration/Cargo.toml +0 -20
  11. migration/README.md +0 -41
  12. migration/src/lib.rs +0 -12
  13. migration/src/m20220101_000001_create_table.rs +0 -440
  14. migration/src/main.rs +0 -6
  15. python/Dockerfile +29 -0
  16. python/README.md +0 -22
  17. python/{nlp/__init__.py → ToPDF.pdf} +0 -0
  18. python/] +63 -0
  19. python/conf/logging.json +0 -41
  20. python/conf/sys.cnf +0 -9
  21. python/llm/__init__.py +0 -21
  22. python/output/ToPDF.pdf +0 -0
  23. python/requirements.txt +0 -194
  24. python/res/1-0.tm +8 -0
  25. python/res/thumbnail-1-0.tm +3 -0
  26. python/svr/add_thumbnail2file.py +0 -118
  27. python/svr/dialog_svr.py +0 -165
  28. python/svr/parse_user_docs.py +0 -258
  29. python/tmp.log +15 -0
  30. python/util/config.py +0 -31
  31. python/util/db_conn.py +0 -70
  32. python/util/setup_logging.py +0 -36
  33. rag/__init__.py +0 -0
  34. rag/llm/__init__.py +32 -0
  35. {python → rag}/llm/chat_model.py +15 -0
  36. {python → rag}/llm/cv_model.py +28 -5
  37. {python → rag}/llm/embedding_model.py +44 -11
  38. rag/nlp/__init__.py +0 -0
  39. {python → rag}/nlp/huchunk.py +0 -0
  40. {python → rag}/nlp/huqie.py +3 -8
  41. {python → rag}/nlp/query.py +3 -3
  42. {python → rag}/nlp/search.py +3 -5
  43. {python → rag}/nlp/synonym.py +7 -10
  44. {python → rag}/nlp/term_weight.py +6 -6
  45. {python → rag}/parser/__init__.py +0 -0
  46. {python → rag}/parser/docx_parser.py +2 -1
  47. {python → rag}/parser/excel_parser.py +13 -5
  48. {python → rag}/parser/pdf_parser.py +4 -3
  49. {python → rag}/res/huqie.txt +0 -0
  50. {python → rag}/res/ner.json +0 -0
.env.template DELETED
@@ -1,9 +0,0 @@
1
- # Database
2
- HOST=127.0.0.1
3
- PORT=8000
4
- DATABASE_URL="postgresql://infiniflow:infiniflow@localhost/docgpt"
5
-
6
- # S3 Storage
7
- MINIO_HOST="127.0.0.1:9000"
8
- MINIO_USR="infiniflow"
9
- MINIO_PWD="infiniflow_docgpt"
Cargo.toml DELETED
@@ -1,42 +0,0 @@
1
- [package]
2
- name = "doc_gpt"
3
- version = "0.1.0"
4
- edition = "2021"
5
-
6
- # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7
-
8
- [dependencies]
9
- actix-web = "4.3.1"
10
- actix-rt = "2.8.0"
11
- actix-files = "0.6.2"
12
- actix-multipart = "0.4"
13
- actix-session = { version = "0.5" }
14
- actix-identity = { version = "0.4" }
15
- actix-web-httpauth = { version = "0.6" }
16
- actix-ws = "0.2.5"
17
- uuid = { version = "1.6.1", features = [
18
- "v4",
19
- "fast-rng",
20
- "macro-diagnostics",
21
- ] }
22
- thiserror = "1.0"
23
- postgres = "0.19.7"
24
- sea-orm = { version = "0.12.9", features = ["sqlx-postgres", "runtime-tokio-native-tls", "macros"] }
25
- serde = { version = "1", features = ["derive"] }
26
- serde_json = "1.0"
27
- tracing-subscriber = "0.3.18"
28
- dotenvy = "0.15.7"
29
- listenfd = "1.0.1"
30
- chrono = "0.4.31"
31
- migration = { path = "./migration" }
32
- minio = "0.1.0"
33
- futures-util = "0.3.29"
34
- actix-multipart-extract = "0.1.5"
35
- regex = "1.10.2"
36
- tokio = { version = "1.35.1", features = ["rt", "time", "macros"] }
37
-
38
- [[bin]]
39
- name = "doc_gpt"
40
-
41
- [workspace]
42
- members = [".", "migration"]
{python/conf → conf}/mapping.json RENAMED
File without changes
conf/private.pem ADDED
@@ -0,0 +1,30 @@
1
+ -----BEGIN RSA PRIVATE KEY-----
2
+ Proc-Type: 4,ENCRYPTED
3
+ DEK-Info: DES-EDE3-CBC,EFF8327C41E531AD
4
+
5
+ 7jdPFDAA6fiTzOIU7XGzKuT324JKZEcK5vBRJqBkA5XO6ENN1wLdhh3zQbl1Ejfv
6
+ KMSUIgbtQEJB4bvOzS//okbZa1vCNYuTS/NGcpKUnhqdOmAL3hl/kOtOLLjTZrwo
7
+ 3KX8iujLH7wQ64GxArtpUuaFq1k0whN1BB5RGJp3IO/L6pMpSWVRKO+JPUrD1Ujr
8
+ XA/LUKQJaZtXVUVOYPtIwbyqPsh93QBetJnRwwV3gNOwGpcX2jDpyTxDUkLJCPPg
9
+ 6Hw0pwlQEd8A11sjxCBbASwLeJO1L0w69QiX9chyOkZ+sfDsVpPt/wf1NexA7Cdj
10
+ 9uifJ4JGbby39QD6mInZGtnRzQRdafjuXlBR2I0Qa7fBRu8QsfhmLbWZfWno7j08
11
+ 4bAAoqB1vRNfSu8LVJXdEEh/HKuwu11pgRr5eH8WQ3hJg+Y2k7zDHpp1VaHL7/Kn
12
+ S+aN5bhQ4Xt0Ujdi1+rsmNchnF6LWsDezHWJeWUM6X7dJnqIBl8oCyghbghT8Tyw
13
+ aEKWXc2+7FsP5yd0NfG3PFYOLdLgfI43pHTAv5PEQ47w9r1XOwfblKKBUDEzaput
14
+ T3t5wQ6wxdyhRxeO4arCHfe/i+j3fzvhlwgbuwrmrkWGWSS86eMTaoGM8+uUrHv0
15
+ 6TbU0tj6DKKUslVk1dCHh9TnmNsXZuLJkceZF38PSKNxhzudU8OTtzhS0tFL91HX
16
+ vo7N+XdiGMs8oOSpjE6RPlhFhVAKGJpXwBj/vXLLcmzesA7ZB2kYtFKMIdsUQpls
17
+ PE/4K5PEX2d8pxA5zxo0HleA1YjW8i5WEcDQThZQzj2sWvg06zSjenVFrbCm9Bro
18
+ hFpAB/3zJHxdRN2MpNpvK35WITy1aDUdX1WdyrlcRtIE5ssFTSoxSj9ibbDZ78+z
19
+ gtbw/MUi6vU6Yz1EjvoYu/bmZAHt9Aagcxw6k58fjO2cEB9njK7xbbiZUSwpJhEe
20
+ U/PxK+SdOU/MmGKeqdgqSfhJkq0vhacvsEjFGRAfivSCHkL0UjhObU+rSJ3g1RMO
21
+ oukAev6TOAwbTKVWjg3/EX+pl/zorAgaPNYFX64TSH4lE3VjeWApITb9Z5C/sVxR
22
+ xW6hU9qyjzWYWY+91y16nkw1l7VQvWHUZwV7QzTScC2BOzDVpeqY1KiYJxgoo6sX
23
+ ZCqR5oh4vToG4W8ZrRyauwUaZJ3r+zhAgm+6n6TJQNwFEl0muji+1nPl32EiFsRs
24
+ qR6CtuhUOVQM4VnILDwFJfuGYRFtKzQgvseLNU4ZqAVqQj8l4ARGAP2P1Au/uUKy
25
+ oGzI7a+b5MvRHuvkxPAclOgXgX/8yyOLaBg+mgaqv9h2JIJD28PzouFl3BajRaVB
26
+ 7GWTnROJYhX5SuX/g585SLRKoQUtK0WhdJCjTRfyRJPwfdppgdTbWO99R4G+ir02
27
+ JQdSkZf2vmZRXenPNTEPDOUY6nVN6sUuBjmtOwoUF194ODgpYB6IaHqK08sa1pUh
28
+ 1mZyxitHdPbygePTe20XWMZFoK2knAqN0JPPbbNjCqiVV+7oqQAnkDIutspu9t2m
29
+ ny3jefFmNozbblQMghLUrq+x9wOEgvS76Sqvq3DG/2BkLzJF3MNkvw==
30
+ -----END RSA PRIVATE KEY-----
conf/public.pem ADDED
@@ -0,0 +1,9 @@
1
+ -----BEGIN PUBLIC KEY-----
2
+ MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArq9XTUSeYr2+N1h3Afl/
3
+ z8Dse/2yD0ZGrKwx+EEEcdsBLca9Ynmx3nIB5obmLlSfmskLpBo0UACBmB5rEjBp
4
+ 2Q2f3AG3Hjd4B+gNCG6BDaawuDlgANIhGnaTLrIqWrrcm4EMzJOnAOI1fgzJRsOO
5
+ UEfaS318Eq9OVO3apEyCCt0lOQK6PuksduOjVxtltDav+guVAA068NrPYmRNabVK
6
+ RNLJpL8w4D44sfth5RvZ3q9t+6RTArpEtc5sh5ChzvqPOzKGMXW83C95TxmXqpbK
7
+ 6olN4RevSfVjEAgCydH6HN6OhtOQEcnrU97r9H0iZOWwbw3pVrZiUkuRD1R56Wzs
8
+ 2wIDAQAB
9
+ -----END PUBLIC KEY-----
conf/service_conf.yaml ADDED
@@ -0,0 +1,28 @@
1
+ authentication:
2
+ client:
3
+ switch: false
4
+ http_app_key:
5
+ http_secret_key:
6
+ site:
7
+ switch: false
8
+ permission:
9
+ switch: false
10
+ component: false
11
+ dataset: false
12
+ ragflow:
13
+ # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported
14
+ host: 127.0.0.1
15
+ http_port: 9380
16
+ database:
17
+ name: 'rag_flow'
18
+ user: 'root'
19
+ passwd: 'infini_rag_flow'
20
+ host: '123.60.95.134'
21
+ port: 5455
22
+ max_connections: 100
23
+ stale_timeout: 30
24
+ oauth:
25
+ github:
26
+ client_id: 302129228f0d96055bee
27
+ secret_key: e518e55ccfcdfcae8996afc40f110e9c95f14fc4
28
+ url: https://github.com/login/oauth/access_token
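
The new conf/service_conf.yaml replaces the old .env/sys.cnf style configuration. A minimal sketch of how it could be read from Python, assuming a plain PyYAML loader; the actual loader code is not part of this diff:

```python
import yaml  # PyYAML is assumed to be available

# Hypothetical consumer of conf/service_conf.yaml added above.
with open("conf/service_conf.yaml") as f:
    conf = yaml.safe_load(f)

ragflow = conf["ragflow"]
db = conf["database"]
print(f"serving on {ragflow['host']}:{ragflow['http_port']}")
print(f"MySQL: {db['user']}@{db['host']}:{db['port']}/{db['name']}")
```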
docker/.env DELETED
@@ -1,21 +0,0 @@
1
- # Version of Elastic products
2
- STACK_VERSION=8.11.3
3
-
4
- # Set the cluster name
5
- CLUSTER_NAME=docgpt
6
-
7
- # Port to expose Elasticsearch HTTP API to the host
8
- ES_PORT=9200
9
-
10
- # Port to expose Kibana to the host
11
- KIBANA_PORT=6601
12
-
13
- # Increase or decrease based on the available host memory (in bytes)
14
- MEM_LIMIT=4073741824
15
-
16
- POSTGRES_USER=root
17
- POSTGRES_PASSWORD=infiniflow_docgpt
18
- POSTGRES_DB=docgpt
19
-
20
- MINIO_USER=infiniflow
21
- MINIO_PASSWORD=infiniflow_docgpt
docker/docker-compose.yml CHANGED
@@ -1,7 +1,7 @@
1
  version: '2.2'
2
  services:
3
  es01:
4
- container_name: docgpt-es-01
5
  image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
  volumes:
7
  - esdata01:/usr/share/elasticsearch/data
@@ -20,14 +20,14 @@ services:
20
  soft: -1
21
  hard: -1
22
  networks:
23
- - docgpt
24
  restart: always
25
 
26
  kibana:
27
  depends_on:
28
  - es01
29
  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
30
- container_name: docgpt-kibana
31
  volumes:
32
  - kibanadata:/usr/share/kibana/data
33
  ports:
@@ -37,26 +37,39 @@ services:
37
  - ELASTICSEARCH_HOSTS=http://es01:9200
38
  mem_limit: ${MEM_LIMIT}
39
  networks:
40
- - docgpt
41
 
42
- postgres:
43
- image: postgres
44
- container_name: docgpt-postgres
45
  environment:
46
- - POSTGRES_USER=${POSTGRES_USER}
47
- - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
48
- - POSTGRES_DB=${POSTGRES_DB}
49
  ports:
50
- - 5455:5432
51
  volumes:
52
- - pg_data:/var/lib/postgresql/data
 
53
  networks:
54
- - docgpt
55
  restart: always
56
 
 
57
  minio:
58
  image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
59
- container_name: docgpt-minio
60
  command: server --console-address ":9001" /data
61
  ports:
62
  - 9000:9000
@@ -67,7 +80,7 @@ services:
67
  volumes:
68
  - minio_data:/data
69
  networks:
70
- - docgpt
71
  restart: always
72
 
73
 
@@ -76,11 +89,11 @@ volumes:
76
  driver: local
77
  kibanadata:
78
  driver: local
79
- pg_data:
80
  driver: local
81
  minio_data:
82
  driver: local
83
 
84
  networks:
85
- docgpt:
86
  driver: bridge
 
1
  version: '2.2'
2
  services:
3
  es01:
4
+ container_name: ragflow-es-01
5
  image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
  volumes:
7
  - esdata01:/usr/share/elasticsearch/data
 
20
  soft: -1
21
  hard: -1
22
  networks:
23
+ - ragflow
24
  restart: always
25
 
26
  kibana:
27
  depends_on:
28
  - es01
29
  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
30
+ container_name: ragflow-kibana
31
  volumes:
32
  - kibanadata:/usr/share/kibana/data
33
  ports:
 
37
  - ELASTICSEARCH_HOSTS=http://es01:9200
38
  mem_limit: ${MEM_LIMIT}
39
  networks:
40
+ - ragflow
41
 
42
+ mysql:
43
+ image: mysql:5.7.18
44
+ container_name: ragflow-mysql
45
  environment:
46
+ - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
47
+ - TZ="Asia/Shanghai"
48
+ command:
49
+ --max_connections=1000
50
+ --character-set-server=utf8mb4
51
+ --collation-server=utf8mb4_general_ci
52
+ --default-authentication-plugin=mysql_native_password
53
+ --tls_version="TLSv1.2,TLSv1.3"
54
+ --init-file /data/application/init.sql
55
  ports:
56
+ - ${MYSQL_PORT}:3306
57
  volumes:
58
+ - mysql_data:/var/lib/mysql
59
+ - ./init.sql:/data/application/init.sql
60
  networks:
61
+ - ragflow
62
+ healthcheck:
63
+ test: [ "CMD-SHELL", "curl --silent localhost:3306 >/dev/null || exit 1" ]
64
+ interval: 10s
65
+ timeout: 10s
66
+ retries: 3
67
  restart: always
68
 
69
+
70
  minio:
71
  image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
72
+ container_name: ragflow-minio
73
  command: server --console-address ":9001" /data
74
  ports:
75
  - 9000:9000
 
80
  volumes:
81
  - minio_data:/data
82
  networks:
83
+ - ragflow
84
  restart: always
85
 
86
 
 
89
  driver: local
90
  kibanadata:
91
  driver: local
92
+ mysql_data:
93
  driver: local
94
  minio_data:
95
  driver: local
96
 
97
  networks:
98
+ ragflow:
99
  driver: bridge
docker/init.sql ADDED
@@ -0,0 +1,2 @@
1
+ CREATE DATABASE IF NOT EXISTS rag_flow;
2
+ USE rag_flow;
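
docker/init.sql is mounted into the new MySQL service via `--init-file /data/application/init.sql` and the `./init.sql:/data/application/init.sql` volume in docker-compose.yml, so `rag_flow` exists on first start. A hedged connectivity check from the compose host, assuming the published port and password match the values in service_conf.yaml and that pymysql is installed (it is not pinned anywhere in this diff):

```python
import pymysql  # assumed driver; not listed in this commit's requirements

# Hypothetical smoke test against the ragflow-mysql container.
conn = pymysql.connect(
    host="127.0.0.1",            # assumes you connect from the host running docker compose
    port=5455,                   # ${MYSQL_PORT} -> 3306; 5455 matches database.port in service_conf.yaml
    user="root",
    password="infini_rag_flow",  # assumes ${MYSQL_PASSWORD} equals database.passwd in service_conf.yaml
    database="rag_flow",         # created by init.sql
)
with conn.cursor() as cur:
    cur.execute("SELECT DATABASE()")
    print(cur.fetchone())        # -> ('rag_flow',)
conn.close()
```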
migration/Cargo.toml DELETED
@@ -1,20 +0,0 @@
1
- [package]
2
- name = "migration"
3
- version = "0.1.0"
4
- edition = "2021"
5
- publish = false
6
-
7
- [lib]
8
- name = "migration"
9
- path = "src/lib.rs"
10
-
11
- [dependencies]
12
- async-std = { version = "1", features = ["attributes", "tokio1"] }
13
- chrono = "0.4.31"
14
-
15
- [dependencies.sea-orm-migration]
16
- version = "0.12.0"
17
- features = [
18
- "runtime-tokio-rustls", # `ASYNC_RUNTIME` feature
19
- "sqlx-postgres", # `DATABASE_DRIVER` feature
20
- ]
migration/README.md DELETED
@@ -1,41 +0,0 @@
1
- # Running Migrator CLI
2
-
3
- - Generate a new migration file
4
- ```sh
5
- cargo run -- generate MIGRATION_NAME
6
- ```
7
- - Apply all pending migrations
8
- ```sh
9
- cargo run
10
- ```
11
- ```sh
12
- cargo run -- up
13
- ```
14
- - Apply first 10 pending migrations
15
- ```sh
16
- cargo run -- up -n 10
17
- ```
18
- - Rollback last applied migrations
19
- ```sh
20
- cargo run -- down
21
- ```
22
- - Rollback last 10 applied migrations
23
- ```sh
24
- cargo run -- down -n 10
25
- ```
26
- - Drop all tables from the database, then reapply all migrations
27
- ```sh
28
- cargo run -- fresh
29
- ```
30
- - Rollback all applied migrations, then reapply all migrations
31
- ```sh
32
- cargo run -- refresh
33
- ```
34
- - Rollback all applied migrations
35
- ```sh
36
- cargo run -- reset
37
- ```
38
- - Check the status of all migrations
39
- ```sh
40
- cargo run -- status
41
- ```
migration/src/lib.rs DELETED
@@ -1,12 +0,0 @@
1
- pub use sea_orm_migration::prelude::*;
2
-
3
- mod m20220101_000001_create_table;
4
-
5
- pub struct Migrator;
6
-
7
- #[async_trait::async_trait]
8
- impl MigratorTrait for Migrator {
9
- fn migrations() -> Vec<Box<dyn MigrationTrait>> {
10
- vec![Box::new(m20220101_000001_create_table::Migration)]
11
- }
12
- }
migration/src/m20220101_000001_create_table.rs DELETED
@@ -1,440 +0,0 @@
1
- use sea_orm_migration::prelude::*;
2
- use chrono::{ FixedOffset, Utc };
3
-
4
- #[allow(dead_code)]
5
- fn now() -> chrono::DateTime<FixedOffset> {
6
- Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap())
7
- }
8
- #[derive(DeriveMigrationName)]
9
- pub struct Migration;
10
-
11
- #[async_trait::async_trait]
12
- impl MigrationTrait for Migration {
13
- async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
14
- manager.create_table(
15
- Table::create()
16
- .table(UserInfo::Table)
17
- .if_not_exists()
18
- .col(
19
- ColumnDef::new(UserInfo::Uid)
20
- .big_integer()
21
- .not_null()
22
- .auto_increment()
23
- .primary_key()
24
- )
25
- .col(ColumnDef::new(UserInfo::Email).string().not_null())
26
- .col(ColumnDef::new(UserInfo::Nickname).string().not_null())
27
- .col(ColumnDef::new(UserInfo::AvatarBase64).string())
28
- .col(ColumnDef::new(UserInfo::ColorScheme).string().default("dark"))
29
- .col(ColumnDef::new(UserInfo::ListStyle).string().default("list"))
30
- .col(ColumnDef::new(UserInfo::Language).string().default("chinese"))
31
- .col(ColumnDef::new(UserInfo::Password).string().not_null())
32
- .col(
33
- ColumnDef::new(UserInfo::LastLoginAt)
34
- .timestamp_with_time_zone()
35
- .default(Expr::current_timestamp())
36
- )
37
- .col(
38
- ColumnDef::new(UserInfo::CreatedAt)
39
- .timestamp_with_time_zone()
40
- .default(Expr::current_timestamp())
41
- .not_null()
42
- )
43
- .col(
44
- ColumnDef::new(UserInfo::UpdatedAt)
45
- .timestamp_with_time_zone()
46
- .default(Expr::current_timestamp())
47
- .not_null()
48
- )
49
- .col(ColumnDef::new(UserInfo::IsDeleted).boolean().default(false))
50
- .to_owned()
51
- ).await?;
52
-
53
- manager.create_table(
54
- Table::create()
55
- .table(TagInfo::Table)
56
- .if_not_exists()
57
- .col(
58
- ColumnDef::new(TagInfo::Tid)
59
- .big_integer()
60
- .not_null()
61
- .auto_increment()
62
- .primary_key()
63
- )
64
- .col(ColumnDef::new(TagInfo::Uid).big_integer().not_null())
65
- .col(ColumnDef::new(TagInfo::TagName).string().not_null())
66
- .col(ColumnDef::new(TagInfo::Regx).string())
67
- .col(ColumnDef::new(TagInfo::Color).tiny_unsigned().default(1))
68
- .col(ColumnDef::new(TagInfo::Icon).tiny_unsigned().default(1))
69
- .col(ColumnDef::new(TagInfo::FolderId).big_integer())
70
- .col(
71
- ColumnDef::new(TagInfo::CreatedAt)
72
- .timestamp_with_time_zone()
73
- .default(Expr::current_timestamp())
74
- .not_null()
75
- )
76
- .col(
77
- ColumnDef::new(TagInfo::UpdatedAt)
78
- .timestamp_with_time_zone()
79
- .default(Expr::current_timestamp())
80
- .not_null()
81
- )
82
- .col(ColumnDef::new(TagInfo::IsDeleted).boolean().default(false))
83
- .to_owned()
84
- ).await?;
85
-
86
- manager.create_table(
87
- Table::create()
88
- .table(Tag2Doc::Table)
89
- .if_not_exists()
90
- .col(
91
- ColumnDef::new(Tag2Doc::Id)
92
- .big_integer()
93
- .not_null()
94
- .auto_increment()
95
- .primary_key()
96
- )
97
- .col(ColumnDef::new(Tag2Doc::TagId).big_integer())
98
- .col(ColumnDef::new(Tag2Doc::Did).big_integer())
99
- .to_owned()
100
- ).await?;
101
-
102
- manager.create_table(
103
- Table::create()
104
- .table(Kb2Doc::Table)
105
- .if_not_exists()
106
- .col(
107
- ColumnDef::new(Kb2Doc::Id)
108
- .big_integer()
109
- .not_null()
110
- .auto_increment()
111
- .primary_key()
112
- )
113
- .col(ColumnDef::new(Kb2Doc::KbId).big_integer())
114
- .col(ColumnDef::new(Kb2Doc::Did).big_integer())
115
- .col(ColumnDef::new(Kb2Doc::KbProgress).float().default(0))
116
- .col(ColumnDef::new(Kb2Doc::KbProgressMsg).string().default(""))
117
- .col(
118
- ColumnDef::new(Kb2Doc::UpdatedAt)
119
- .timestamp_with_time_zone()
120
- .default(Expr::current_timestamp())
121
- .not_null()
122
- )
123
- .col(ColumnDef::new(Kb2Doc::IsDeleted).boolean().default(false))
124
- .to_owned()
125
- ).await?;
126
-
127
- manager.create_table(
128
- Table::create()
129
- .table(Dialog2Kb::Table)
130
- .if_not_exists()
131
- .col(
132
- ColumnDef::new(Dialog2Kb::Id)
133
- .big_integer()
134
- .not_null()
135
- .auto_increment()
136
- .primary_key()
137
- )
138
- .col(ColumnDef::new(Dialog2Kb::DialogId).big_integer())
139
- .col(ColumnDef::new(Dialog2Kb::KbId).big_integer())
140
- .to_owned()
141
- ).await?;
142
-
143
- manager.create_table(
144
- Table::create()
145
- .table(Doc2Doc::Table)
146
- .if_not_exists()
147
- .col(
148
- ColumnDef::new(Doc2Doc::Id)
149
- .big_integer()
150
- .not_null()
151
- .auto_increment()
152
- .primary_key()
153
- )
154
- .col(ColumnDef::new(Doc2Doc::ParentId).big_integer())
155
- .col(ColumnDef::new(Doc2Doc::Did).big_integer())
156
- .to_owned()
157
- ).await?;
158
-
159
- manager.create_table(
160
- Table::create()
161
- .table(KbInfo::Table)
162
- .if_not_exists()
163
- .col(
164
- ColumnDef::new(KbInfo::KbId)
165
- .big_integer()
166
- .auto_increment()
167
- .not_null()
168
- .primary_key()
169
- )
170
- .col(ColumnDef::new(KbInfo::Uid).big_integer().not_null())
171
- .col(ColumnDef::new(KbInfo::KbName).string().not_null())
172
- .col(ColumnDef::new(KbInfo::Icon).tiny_unsigned().default(1))
173
- .col(
174
- ColumnDef::new(KbInfo::CreatedAt)
175
- .timestamp_with_time_zone()
176
- .default(Expr::current_timestamp())
177
- .not_null()
178
- )
179
- .col(
180
- ColumnDef::new(KbInfo::UpdatedAt)
181
- .timestamp_with_time_zone()
182
- .default(Expr::current_timestamp())
183
- .not_null()
184
- )
185
- .col(ColumnDef::new(KbInfo::IsDeleted).boolean().default(false))
186
- .to_owned()
187
- ).await?;
188
-
189
- manager.create_table(
190
- Table::create()
191
- .table(DocInfo::Table)
192
- .if_not_exists()
193
- .col(
194
- ColumnDef::new(DocInfo::Did)
195
- .big_integer()
196
- .not_null()
197
- .auto_increment()
198
- .primary_key()
199
- )
200
- .col(ColumnDef::new(DocInfo::Uid).big_integer().not_null())
201
- .col(ColumnDef::new(DocInfo::DocName).string().not_null())
202
- .col(ColumnDef::new(DocInfo::Location).string().not_null())
203
- .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
204
- .col(ColumnDef::new(DocInfo::Type).string().not_null())
205
- .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
206
- .comment("doc type|folder")
207
- .col(
208
- ColumnDef::new(DocInfo::CreatedAt)
209
- .timestamp_with_time_zone()
210
- .default(Expr::current_timestamp())
211
- .not_null()
212
- )
213
- .col(
214
- ColumnDef::new(DocInfo::UpdatedAt)
215
- .timestamp_with_time_zone()
216
- .default(Expr::current_timestamp())
217
- .not_null()
218
- )
219
- .col(ColumnDef::new(DocInfo::IsDeleted).boolean().default(false))
220
- .to_owned()
221
- ).await?;
222
-
223
- manager.create_table(
224
- Table::create()
225
- .table(DialogInfo::Table)
226
- .if_not_exists()
227
- .col(
228
- ColumnDef::new(DialogInfo::DialogId)
229
- .big_integer()
230
- .not_null()
231
- .auto_increment()
232
- .primary_key()
233
- )
234
- .col(ColumnDef::new(DialogInfo::Uid).big_integer().not_null())
235
- .col(ColumnDef::new(DialogInfo::KbId).big_integer().not_null())
236
- .col(ColumnDef::new(DialogInfo::DialogName).string().not_null())
237
- .col(ColumnDef::new(DialogInfo::History).string().comment("json"))
238
- .col(
239
- ColumnDef::new(DialogInfo::CreatedAt)
240
- .timestamp_with_time_zone()
241
- .default(Expr::current_timestamp())
242
- .not_null()
243
- )
244
- .col(
245
- ColumnDef::new(DialogInfo::UpdatedAt)
246
- .timestamp_with_time_zone()
247
- .default(Expr::current_timestamp())
248
- .not_null()
249
- )
250
- .col(ColumnDef::new(DialogInfo::IsDeleted).boolean().default(false))
251
- .to_owned()
252
- ).await?;
253
-
254
- let root_insert = Query::insert()
255
- .into_table(UserInfo::Table)
256
- .columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password])
257
- .values_panic(["[email protected]".into(), "root".into(), "123456".into()])
258
- .to_owned();
259
-
260
- let doc_insert = Query::insert()
261
- .into_table(DocInfo::Table)
262
- .columns([
263
- DocInfo::Uid,
264
- DocInfo::DocName,
265
- DocInfo::Size,
266
- DocInfo::Type,
267
- DocInfo::Location,
268
- ])
269
- .values_panic([(1).into(), "/".into(), (0).into(), "folder".into(), "".into()])
270
- .to_owned();
271
-
272
- let tag_insert = Query::insert()
273
- .into_table(TagInfo::Table)
274
- .columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon])
275
- .values_panic([
276
- (1).into(),
277
- "Video".into(),
278
- ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
279
- (1).into(),
280
- (1).into(),
281
- ])
282
- .values_panic([
283
- (1).into(),
284
- "Picture".into(),
285
- ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
286
- (2).into(),
287
- (2).into(),
288
- ])
289
- .values_panic([
290
- (1).into(),
291
- "Music".into(),
292
- ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
293
- (3).into(),
294
- (3).into(),
295
- ])
296
- .values_panic([
297
- (1).into(),
298
- "Document".into(),
299
- ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
300
- (3).into(),
301
- (3).into(),
302
- ])
303
- .to_owned();
304
-
305
- manager.exec_stmt(root_insert).await?;
306
- manager.exec_stmt(doc_insert).await?;
307
- manager.exec_stmt(tag_insert).await?;
308
- Ok(())
309
- }
310
-
311
- async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
312
- manager.drop_table(Table::drop().table(UserInfo::Table).to_owned()).await?;
313
-
314
- manager.drop_table(Table::drop().table(TagInfo::Table).to_owned()).await?;
315
-
316
- manager.drop_table(Table::drop().table(Tag2Doc::Table).to_owned()).await?;
317
-
318
- manager.drop_table(Table::drop().table(Kb2Doc::Table).to_owned()).await?;
319
-
320
- manager.drop_table(Table::drop().table(Dialog2Kb::Table).to_owned()).await?;
321
-
322
- manager.drop_table(Table::drop().table(Doc2Doc::Table).to_owned()).await?;
323
-
324
- manager.drop_table(Table::drop().table(KbInfo::Table).to_owned()).await?;
325
-
326
- manager.drop_table(Table::drop().table(DocInfo::Table).to_owned()).await?;
327
-
328
- manager.drop_table(Table::drop().table(DialogInfo::Table).to_owned()).await?;
329
-
330
- Ok(())
331
- }
332
- }
333
-
334
- #[derive(DeriveIden)]
335
- enum UserInfo {
336
- Table,
337
- Uid,
338
- Email,
339
- Nickname,
340
- AvatarBase64,
341
- ColorScheme,
342
- ListStyle,
343
- Language,
344
- Password,
345
- LastLoginAt,
346
- CreatedAt,
347
- UpdatedAt,
348
- IsDeleted,
349
- }
350
-
351
- #[derive(DeriveIden)]
352
- enum TagInfo {
353
- Table,
354
- Tid,
355
- Uid,
356
- TagName,
357
- Regx,
358
- Color,
359
- Icon,
360
- FolderId,
361
- CreatedAt,
362
- UpdatedAt,
363
- IsDeleted,
364
- }
365
-
366
- #[derive(DeriveIden)]
367
- enum Tag2Doc {
368
- Table,
369
- Id,
370
- TagId,
371
- Did,
372
- }
373
-
374
- #[derive(DeriveIden)]
375
- enum Kb2Doc {
376
- Table,
377
- Id,
378
- KbId,
379
- Did,
380
- KbProgress,
381
- KbProgressMsg,
382
- UpdatedAt,
383
- IsDeleted,
384
- }
385
-
386
- #[derive(DeriveIden)]
387
- enum Dialog2Kb {
388
- Table,
389
- Id,
390
- DialogId,
391
- KbId,
392
- }
393
-
394
- #[derive(DeriveIden)]
395
- enum Doc2Doc {
396
- Table,
397
- Id,
398
- ParentId,
399
- Did,
400
- }
401
-
402
- #[derive(DeriveIden)]
403
- enum KbInfo {
404
- Table,
405
- KbId,
406
- Uid,
407
- KbName,
408
- Icon,
409
- CreatedAt,
410
- UpdatedAt,
411
- IsDeleted,
412
- }
413
-
414
- #[derive(DeriveIden)]
415
- enum DocInfo {
416
- Table,
417
- Did,
418
- Uid,
419
- DocName,
420
- Location,
421
- Size,
422
- Type,
423
- ThumbnailBase64,
424
- CreatedAt,
425
- UpdatedAt,
426
- IsDeleted,
427
- }
428
-
429
- #[derive(DeriveIden)]
430
- enum DialogInfo {
431
- Table,
432
- Uid,
433
- KbId,
434
- DialogId,
435
- DialogName,
436
- History,
437
- CreatedAt,
438
- UpdatedAt,
439
- IsDeleted,
440
- }
migration/src/main.rs DELETED
@@ -1,6 +0,0 @@
1
- use sea_orm_migration::prelude::*;
2
-
3
- #[async_std::main]
4
- async fn main() {
5
- cli::run_cli(migration::Migrator).await;
6
- }
python/Dockerfile ADDED
@@ -0,0 +1,29 @@
1
+ FROM ubuntu:22.04 as base
2
+
3
+ RUN apt-get update
4
+
5
+ ENV TZ="Asia/Taipei"
6
+ RUN apt-get install -yq \
7
+ build-essential \
8
+ curl \
9
+ libncursesw5-dev \
10
+ libssl-dev \
11
+ libsqlite3-dev \
12
+ libgdbm-dev \
13
+ libc6-dev \
14
+ libbz2-dev \
15
+ software-properties-common \
16
+ python3.11 python3.11-dev python3-pip
17
+
18
+ RUN apt-get install -yq git
19
+ RUN pip3 config set global.index-url https://mirror.baidu.com/pypi/simple
20
+ RUN pip3 config set global.trusted-host mirror.baidu.com
21
+ RUN pip3 install --upgrade pip
22
+ RUN pip3 install torch==2.0.1
23
+ RUN pip3 install torch-model-archiver==0.8.2
24
+ RUN pip3 install torchvision==0.15.2
25
+ COPY requirements.txt .
26
+
27
+ WORKDIR /docgpt
28
+ ENV PYTHONPATH=/docgpt/
29
+
python/README.md DELETED
@@ -1,22 +0,0 @@
1
-
2
- ```shell
3
-
4
- docker pull postgres
5
-
6
- LOCAL_POSTGRES_DATA=./postgres-data
7
-
8
- docker run
9
- --name docass-postgres
10
- -p 5455:5432
11
- -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data
12
- -e POSTGRES_USER=root
13
- -e POSTGRES_PASSWORD=infiniflow_docass
14
- -e POSTGRES_DB=docass
15
- -d
16
- postgres
17
-
18
- docker network create elastic
19
- docker pull elasticsearch:8.11.3;
20
- docker pull docker.elastic.co/kibana/kibana:8.11.3
21
-
22
- ```
python/{nlp/__init__.py → ToPDF.pdf} RENAMED
File without changes
python/] ADDED
@@ -0,0 +1,63 @@
1
+ from abc import ABC
2
+ from openai import OpenAI
3
+ import os
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ class Base(ABC):
8
+ def describe(self, image, max_tokens=300):
9
+ raise NotImplementedError("Please implement encode method!")
10
+
11
+
12
+ class GptV4(Base):
13
+ def __init__(self):
14
+ import openapi
15
+ openapi.api_key = os.environ["OPENAPI_KEY"]
16
+ self.client = OpenAI()
17
+
18
+ def describe(self, image, max_tokens=300):
19
+ buffered = BytesIO()
20
+ try:
21
+ image.save(buffered, format="JPEG")
22
+ except Exception as e:
23
+ image.save(buffered, format="PNG")
24
+ b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
25
+
26
+ res = self.client.chat.completions.create(
27
+ model="gpt-4-vision-preview",
28
+ messages=[
29
+ {
30
+ "role": "user",
31
+ "content": [
32
+ {
33
+ "type": "text",
34
+ "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
35
+ },
36
+ {
37
+ "type": "image_url",
38
+ "image_url": {
39
+ "url": f"data:image/jpeg;base64,{b64}"
40
+ },
41
+ },
42
+ ],
43
+ }
44
+ ],
45
+ max_tokens=max_tokens,
46
+ )
47
+ return res.choices[0].message.content.strip()
48
+
49
+
50
+ class QWen(Base):
51
+ def chat(self, system, history, gen_conf):
52
+ from http import HTTPStatus
53
+ from dashscope import Generation
54
+ from dashscope.api_entities.dashscope_response import Role
55
+ # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
56
+ response = Generation.call(
57
+ Generation.Models.qwen_turbo,
58
+ messages=messages,
59
+ result_format='message'
60
+ )
61
+ if response.status_code == HTTPStatus.OK:
62
+ return response.output.choices[0]['message']['content']
63
+ return response.message
python/conf/logging.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "version":1,
3
- "disable_existing_loggers":false,
4
- "formatters":{
5
- "simple":{
6
- "format":"%(asctime)s - %(name)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s"
7
- }
8
- },
9
- "handlers":{
10
- "console":{
11
- "class":"logging.StreamHandler",
12
- "level":"DEBUG",
13
- "formatter":"simple",
14
- "stream":"ext://sys.stdout"
15
- },
16
- "info_file_handler":{
17
- "class":"logging.handlers.TimedRotatingFileHandler",
18
- "level":"INFO",
19
- "formatter":"simple",
20
- "filename":"log/info.log",
21
- "when": "MIDNIGHT",
22
- "interval":1,
23
- "backupCount":30,
24
- "encoding":"utf8"
25
- },
26
- "error_file_handler":{
27
- "class":"logging.handlers.TimedRotatingFileHandler",
28
- "level":"ERROR",
29
- "formatter":"simple",
30
- "filename":"log/errors.log",
31
- "when": "MIDNIGHT",
32
- "interval":1,
33
- "backupCount":30,
34
- "encoding":"utf8"
35
- }
36
- },
37
- "root":{
38
- "level":"DEBUG",
39
- "handlers":["console","info_file_handler","error_file_handler"]
40
- }
41
- }
python/conf/sys.cnf DELETED
@@ -1,9 +0,0 @@
1
- [infiniflow]
2
- es=http://es01:9200
3
- postgres_user=root
4
- postgres_password=infiniflow_docgpt
5
- postgres_host=postgres
6
- postgres_port=5432
7
- minio_host=minio:9000
8
- minio_user=infiniflow
9
- minio_password=infiniflow_docgpt
python/llm/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- from .embedding_model import *
3
- from .chat_model import *
4
- from .cv_model import *
5
-
6
- EmbeddingModel = None
7
- ChatModel = None
8
- CvModel = None
9
-
10
-
11
- if os.environ.get("OPENAI_API_KEY"):
12
- EmbeddingModel = GptEmbed()
13
- ChatModel = GptTurbo()
14
- CvModel = GptV4()
15
-
16
- elif os.environ.get("DASHSCOPE_API_KEY"):
17
- EmbeddingModel = QWenEmbd()
18
- ChatModel = QWenChat()
19
- CvModel = QWenCV()
20
- else:
21
- EmbeddingModel = HuEmbedding()
python/output/ToPDF.pdf ADDED
File without changes
python/requirements.txt DELETED
@@ -1,194 +0,0 @@
1
- accelerate==0.24.1
2
- addict==2.4.0
3
- aiobotocore==2.7.0
4
- aiofiles==23.2.1
5
- aiohttp==3.8.6
6
- aioitertools==0.11.0
7
- aiosignal==1.3.1
8
- aliyun-python-sdk-core==2.14.0
9
- aliyun-python-sdk-kms==2.16.2
10
- altair==5.1.2
11
- anyio==3.7.1
12
- astor==0.8.1
13
- async-timeout==4.0.3
14
- attrdict==2.0.1
15
- attrs==23.1.0
16
- Babel==2.13.1
17
- bce-python-sdk==0.8.92
18
- beautifulsoup4==4.12.2
19
- bitsandbytes==0.41.1
20
- blinker==1.7.0
21
- botocore==1.31.64
22
- cachetools==5.3.2
23
- certifi==2023.7.22
24
- cffi==1.16.0
25
- charset-normalizer==3.3.2
26
- click==8.1.7
27
- cloudpickle==3.0.0
28
- contourpy==1.2.0
29
- crcmod==1.7
30
- cryptography==41.0.5
31
- cssselect==1.2.0
32
- cssutils==2.9.0
33
- cycler==0.12.1
34
- Cython==3.0.5
35
- datasets==2.13.0
36
- datrie==0.8.2
37
- decorator==5.1.1
38
- defusedxml==0.7.1
39
- dill==0.3.6
40
- einops==0.7.0
41
- elastic-transport==8.10.0
42
- elasticsearch==8.10.1
43
- elasticsearch-dsl==8.9.0
44
- et-xmlfile==1.1.0
45
- fastapi==0.104.1
46
- ffmpy==0.3.1
47
- filelock==3.13.1
48
- fire==0.5.0
49
- FlagEmbedding==1.1.5
50
- Flask==3.0.0
51
- flask-babel==4.0.0
52
- fonttools==4.44.0
53
- frozenlist==1.4.0
54
- fsspec==2023.10.0
55
- future==0.18.3
56
- gast==0.5.4
57
- -e
58
- git+https://github.com/ggerganov/llama.cpp.git@5f6e0c0dff1e7a89331e6b25eca9a9fd71324069#egg=gguf&subdirectory=gguf-py
59
- gradio==3.50.2
60
- gradio_client==0.6.1
61
- greenlet==3.0.1
62
- h11==0.14.0
63
- hanziconv==0.3.2
64
- httpcore==1.0.1
65
- httpx==0.25.1
66
- huggingface-hub==0.17.3
67
- idna==3.4
68
- imageio==2.31.6
69
- imgaug==0.4.0
70
- importlib-metadata==6.8.0
71
- importlib-resources==6.1.0
72
- install==1.3.5
73
- itsdangerous==2.1.2
74
- Jinja2==3.1.2
75
- jmespath==0.10.0
76
- joblib==1.3.2
77
- jsonschema==4.19.2
78
- jsonschema-specifications==2023.7.1
79
- kiwisolver==1.4.5
80
- lazy_loader==0.3
81
- lmdb==1.4.1
82
- lxml==4.9.3
83
- MarkupSafe==2.1.3
84
- matplotlib==3.8.1
85
- modelscope==1.9.4
86
- mpmath==1.3.0
87
- multidict==6.0.4
88
- multiprocess==0.70.14
89
- networkx==3.2.1
90
- nltk==3.8.1
91
- numpy==1.24.4
92
- nvidia-cublas-cu12==12.1.3.1
93
- nvidia-cuda-cupti-cu12==12.1.105
94
- nvidia-cuda-nvrtc-cu12==12.1.105
95
- nvidia-cuda-runtime-cu12==12.1.105
96
- nvidia-cudnn-cu12==8.9.2.26
97
- nvidia-cufft-cu12==11.0.2.54
98
- nvidia-curand-cu12==10.3.2.106
99
- nvidia-cusolver-cu12==11.4.5.107
100
- nvidia-cusparse-cu12==12.1.0.106
101
- nvidia-nccl-cu12==2.18.1
102
- nvidia-nvjitlink-cu12==12.3.52
103
- nvidia-nvtx-cu12==12.1.105
104
- opencv-contrib-python==4.6.0.66
105
- opencv-python==4.6.0.66
106
- openpyxl==3.1.2
107
- opt-einsum==3.3.0
108
- orjson==3.9.10
109
- oss2==2.18.3
110
- packaging==23.2
111
- paddleocr==2.7.0.3
112
- paddlepaddle-gpu==2.5.2.post120
113
- pandas==2.1.2
114
- pdf2docx==0.5.5
115
- pdfminer.six==20221105
116
- pdfplumber==0.10.3
117
- Pillow==10.0.1
118
- platformdirs==3.11.0
119
- premailer==3.10.0
120
- protobuf==4.25.0
121
- psutil==5.9.6
122
- pyarrow==14.0.0
123
- pyclipper==1.3.0.post5
124
- pycocotools==2.0.7
125
- pycparser==2.21
126
- pycryptodome==3.19.0
127
- pydantic==1.10.13
128
- pydub==0.25.1
129
- PyMuPDF==1.20.2
130
- pyparsing==3.1.1
131
- pypdfium2==4.23.1
132
- python-dateutil==2.8.2
133
- python-docx==1.1.0
134
- python-multipart==0.0.6
135
- pytz==2023.3.post1
136
- PyYAML==6.0.1
137
- rapidfuzz==3.5.2
138
- rarfile==4.1
139
- referencing==0.30.2
140
- regex==2023.10.3
141
- requests==2.31.0
142
- rpds-py==0.12.0
143
- s3fs==2023.10.0
144
- safetensors==0.4.0
145
- scikit-image==0.22.0
146
- scikit-learn==1.3.2
147
- scipy==1.11.3
148
- semantic-version==2.10.0
149
- sentence-transformers==2.2.2
150
- sentencepiece==0.1.98
151
- shapely==2.0.2
152
- simplejson==3.19.2
153
- six==1.16.0
154
- sniffio==1.3.0
155
- sortedcontainers==2.4.0
156
- soupsieve==2.5
157
- SQLAlchemy==2.0.23
158
- starlette==0.27.0
159
- sympy==1.12
160
- tabulate==0.9.0
161
- tblib==3.0.0
162
- termcolor==2.3.0
163
- threadpoolctl==3.2.0
164
- tifffile==2023.9.26
165
- tiktoken==0.5.1
166
- timm==0.9.10
167
- tokenizers==0.13.3
168
- tomli==2.0.1
169
- toolz==0.12.0
170
- torch==2.1.0
171
- torchaudio==2.1.0
172
- torchvision==0.16.0
173
- tornado==6.3.3
174
- tqdm==4.66.1
175
- transformers==4.33.0
176
- transformers-stream-generator==0.0.4
177
- triton==2.1.0
178
- typing_extensions==4.8.0
179
- tzdata==2023.3
180
- urllib3==2.0.7
181
- uvicorn==0.24.0
182
- uvloop==0.19.0
183
- visualdl==2.5.3
184
- websockets==11.0.3
185
- Werkzeug==3.0.1
186
- wrapt==1.15.0
187
- xgboost==2.0.1
188
- xinference==0.6.0
189
- xorbits==0.7.0
190
- xoscar==0.1.3
191
- xxhash==3.4.1
192
- yapf==0.40.2
193
- yarl==1.9.2
194
- zipp==3.17.0
python/res/1-0.tm ADDED
@@ -0,0 +1,8 @@
1
+ 2023-12-20 11:44:08.791336+00:00
2
+ 2023-12-20 11:44:08.853249+00:00
3
+ 2023-12-20 11:44:08.909933+00:00
4
+ 2023-12-21 00:47:09.996757+00:00
5
+ 2023-12-20 11:44:08.965855+00:00
6
+ 2023-12-20 11:44:09.011682+00:00
7
+ 2023-12-21 00:47:10.063326+00:00
8
+ 2023-12-20 11:44:09.069486+00:00
python/res/thumbnail-1-0.tm ADDED
@@ -0,0 +1,3 @@
1
+ 2023-12-27 08:21:49.309802+00:00
2
+ 2023-12-27 08:37:22.407772+00:00
3
+ 2023-12-27 08:59:18.845627+00:00
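
The `.tm` files added above are high-water-mark timestamp logs: the `python/svr/*.py` workers deleted below append one timestamp per processed row and read the latest one back through `util.findMaxDt` to resume polling. A rough sketch of what that helper presumably does; the real `python/util/__init__.py` is not shown in this diff:

```python
import os

def find_max_dt(fnm, default="1970-01-01 00:00:00"):
    """Return the latest timestamp recorded in a res/*.tm file (assumed behavior)."""
    mx = default
    if not os.path.exists(fnm):
        return mx
    with open(fnm) as f:
        for line in f:
            line = line.strip()
            # ISO-8601 UTC strings compare chronologically as plain strings
            if line and line > mx:
                mx = line
    return mx
```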
python/svr/add_thumbnail2file.py DELETED
@@ -1,118 +0,0 @@
1
- import sys, datetime, random, re, cv2
2
- from os.path import dirname, realpath
3
- sys.path.append(dirname(realpath(__file__)) + "/../")
4
- from util.db_conn import Postgres
5
- from util.minio_conn import HuMinio
6
- from util import findMaxDt
7
- import base64
8
- from io import BytesIO
9
- import pandas as pd
10
- from PIL import Image
11
- import pdfplumber
12
-
13
-
14
- PG = Postgres("infiniflow", "docgpt")
15
- MINIO = HuMinio("infiniflow")
16
- def set_thumbnail(did, base64):
17
- sql = f"""
18
- update doc_info set thumbnail_base64='{base64}'
19
- where
20
- did={did}
21
- """
22
- PG.update(sql)
23
-
24
-
25
- def collect(comm, mod, tm):
26
- sql = f"""
27
- select
28
- did, uid, doc_name, location, updated_at
29
- from doc_info
30
- where
31
- updated_at >= '{tm}'
32
- and MOD(did, {comm}) = {mod}
33
- and is_deleted=false
34
- and type <> 'folder'
35
- and thumbnail_base64=''
36
- order by updated_at asc
37
- limit 10
38
- """
39
- docs = PG.select(sql)
40
- if len(docs) == 0:return pd.DataFrame()
41
-
42
- mtm = str(docs["updated_at"].max())[:19]
43
- print("TOTAL:", len(docs), "To: ", mtm)
44
- return docs
45
-
46
-
47
- def build(row):
48
- if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
49
- row["doc_name"].lower().strip()):
50
- set_thumbnail(row["did"], "_")
51
- return
52
-
53
- def thumbnail(img, SIZE=128):
54
- w,h = img.size
55
- p = SIZE/max(w, h)
56
- w, h = int(w*p), int(h*p)
57
- img.thumbnail((w, h))
58
- buffered = BytesIO()
59
- try:
60
- img.save(buffered, format="JPEG")
61
- except Exception as e:
62
- try:
63
- img.save(buffered, format="PNG")
64
- except Exception as ee:
65
- pass
66
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
67
-
68
-
69
- iobytes = BytesIO(MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
70
- if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
71
- pdf = pdfplumber.open(iobytes)
72
- img = pdf.pages[0].to_image().annotated
73
- set_thumbnail(row["did"], thumbnail(img))
74
-
75
- if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
76
- img = Image.open(iobytes)
77
- set_thumbnail(row["did"], thumbnail(img))
78
-
79
- if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
80
- url = MINIO.get_presigned_url("%s-upload"%str(row["uid"]),
81
- row["location"],
82
- expires=datetime.timedelta(seconds=60)
83
- )
84
- cap = cv2.VideoCapture(url)
85
- succ = cap.isOpened()
86
- i = random.randint(1, 11)
87
- while succ:
88
- ret, frame = cap.read()
89
- if not ret: break
90
- if i > 0:
91
- i -= 1
92
- continue
93
- img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
94
- print(img.size)
95
- set_thumbnail(row["did"], thumbnail(img))
96
- cap.release()
97
- cv2.destroyAllWindows()
98
-
99
-
100
- def main(comm, mod):
101
- global model
102
- tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
103
- tm = findMaxDt(tm_fnm)
104
- rows = collect(comm, mod, tm)
105
- if len(rows) == 0:return
106
-
107
- tmf = open(tm_fnm, "a+")
108
- for _, r in rows.iterrows():
109
- build(r)
110
- tmf.write(str(r["updated_at"]) + "\n")
111
- tmf.close()
112
-
113
-
114
- if __name__ == "__main__":
115
- from mpi4py import MPI
116
- comm = MPI.COMM_WORLD
117
- main(comm.Get_size(), comm.Get_rank())
118
-
python/svr/dialog_svr.py DELETED
@@ -1,165 +0,0 @@
1
- #-*- coding:utf-8 -*-
2
- import sys, os, re,inspect,json,traceback,logging,argparse, copy
3
- sys.path.append(os.path.realpath(os.path.dirname(inspect.getfile(inspect.currentframe())))+"/../")
4
- from tornado.web import RequestHandler,Application
5
- from tornado.ioloop import IOLoop
6
- from tornado.httpserver import HTTPServer
7
- from tornado.options import define,options
8
- from util import es_conn, setup_logging
9
- from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
10
- from nlp import huqie
11
- from nlp import query as Query
12
- from nlp import search
13
- from llm import HuEmbedding, GptTurbo
14
- import numpy as np
15
- from io import BytesIO
16
- from util import config
17
- from timeit import default_timer as timer
18
- from collections import OrderedDict
19
- from llm import ChatModel, EmbeddingModel
20
-
21
- SE = None
22
- CFIELD="content_ltks"
23
- EMBEDDING = EmbeddingModel
24
- LLM = ChatModel
25
-
26
- def get_QA_pairs(hists):
27
- pa = []
28
- for h in hists:
29
- for k in ["user", "assistant"]:
30
- if h.get(k):
31
- pa.append({
32
- "content": h[k],
33
- "role": k,
34
- })
35
-
36
- for p in pa[:-1]: assert len(p) == 2, p
37
- return pa
38
-
39
-
40
-
41
- def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
42
- max_len //= len(top_i)
43
- # add instruction to prompt
44
- instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
45
- if len(instructions)>2:
46
- # Said that LLM is sensitive to the first and the last one, so
47
- # rearrange the order of references
48
- instructions.append(copy.deepcopy(instructions[1]))
49
- instructions.pop(1)
50
-
51
- def token_num(txt):
52
- c = 0
53
- for tk in re.split(r"[,。/?‘’”“:;:;!!]", txt):
54
- if re.match(r"[a-zA-Z-]+$", tk):
55
- c += 1
56
- continue
57
- c += len(tk)
58
- return c
59
-
60
- _inst = ""
61
- for ins in instructions:
62
- if token_num(_inst) > 4096:
63
- _inst += "\n知识库:" + instructions[-1][:max_len]
64
- break
65
- _inst += "\n知识库:" + ins[:max_len]
66
- return _inst
67
-
68
-
69
- def prompt_and_answer(history, inst):
70
- hist = get_QA_pairs(history)
71
- chks = []
72
- for s in re.split(r"[::;;。\n\r]+", inst):
73
- if s: chks.append(s)
74
- chks = len(set(chks))/(0.1+len(chks))
75
- print("Duplication portion:", chks)
76
-
77
- system = """
78
- 你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答%s。当所有知识库内容都与问题无关时,你的回答必须包括"知识库中未找到您要的答案!这是我所知道的,仅作参考。"这句话。回答需要考虑聊天历史。
79
- 以下是知识库:
80
- %s
81
- 以上是知识库。
82
- """%((",最好总结成表格" if chks<0.6 and chks>0 else ""), inst)
83
-
84
- print("【PROMPT】:", system)
85
- start = timer()
86
- response = LLM.chat(system, hist, {"temperature": 0.2, "max_tokens": 512})
87
- print("GENERATE: ", timer()-start)
88
- print("===>>", response)
89
- return response
90
-
91
-
92
- class Handler(RequestHandler):
93
- def post(self):
94
- global SE,MUST_TK_NUM
95
- param = json.loads(self.request.body.decode('utf-8'))
96
- try:
97
- question = param.get("history",[{"user": "Hi!"}])[-1]["user"]
98
- res = SE.search({
99
- "question": question,
100
- "kb_ids": param.get("kb_ids", []),
101
- "size": param.get("topn", 15)},
102
- search.index_name(param["uid"])
103
- )
104
-
105
- sim = SE.rerank(res, question)
106
- rk_idx = np.argsort(sim*-1)
107
- topidx = [i for i in rk_idx if sim[i] >= aram.get("similarity", 0.5)][:param.get("topn",12)]
108
- inst = get_instruction(res, topidx)
109
-
110
- ans, topidx = prompt_and_answer(param["history"], inst)
111
- ans = SE.insert_citations(ans, topidx, res)
112
-
113
- refer = OrderedDict()
114
- docnms = {}
115
- for i in rk_idx:
116
- did = res.field[res.ids[i]]["doc_id"]
117
- if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
118
- if did not in refer: refer[did] = []
119
- refer[did].append({
120
- "chunk_id": res.ids[i],
121
- "content": res.field[res.ids[i]]["content_ltks"],
122
- "image": ""
123
- })
124
-
125
- print("::::::::::::::", ans)
126
- self.write(json.dumps({
127
- "code":0,
128
- "msg":"success",
129
- "data":{
130
- "uid": param["uid"],
131
- "dialog_id": param["dialog_id"],
132
- "assistant": ans,
133
- "refer": [{
134
- "did": did,
135
- "doc_name": docnms[did],
136
- "chunks": chunks
137
- } for did, chunks in refer.items()]
138
- }
139
- }))
140
- logging.info("SUCCESS[%d]"%(res.total)+json.dumps(param, ensure_ascii=False))
141
-
142
- except Exception as e:
143
- logging.error("Request 500: "+str(e))
144
- self.write(json.dumps({
145
- "code":500,
146
- "msg":str(e),
147
- "data":{}
148
- }))
149
- print(traceback.format_exc())
150
-
151
-
152
- if __name__ == '__main__':
153
- parser = argparse.ArgumentParser()
154
- parser.add_argument("--port", default=4455, type=int, help="Port used for service")
155
- ARGS = parser.parse_args()
156
-
157
- SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
158
-
159
- app = Application([(r'/v1/chat/completions', Handler)],debug=False)
160
- http_server = HTTPServer(app)
161
- http_server.bind(ARGS.port)
162
- http_server.start(3)
163
-
164
- IOLoop.current().start()
165
-
python/svr/parse_user_docs.py DELETED
@@ -1,258 +0,0 @@
1
- import json, os, sys, hashlib, copy, time, random, re
2
- from os.path import dirname, realpath
3
- sys.path.append(dirname(realpath(__file__)) + "/../")
4
- from util.es_conn import HuEs
5
- from util.db_conn import Postgres
6
- from util.minio_conn import HuMinio
7
- from util import rmSpace, findMaxDt
8
- from FlagEmbedding import FlagModel
9
- from nlp import huchunk, huqie, search
10
- from io import BytesIO
11
- import pandas as pd
12
- from elasticsearch_dsl import Q
13
- from PIL import Image
14
- from parser import (
15
- PdfParser,
16
- DocxParser,
17
- ExcelParser
18
- )
19
- from nlp.huchunk import (
20
- PdfChunker,
21
- DocxChunker,
22
- ExcelChunker,
23
- PptChunker,
24
- TextChunker
25
- )
26
-
27
- ES = HuEs("infiniflow")
28
- BATCH_SIZE = 64
29
- PG = Postgres("infiniflow", "docgpt")
30
- MINIO = HuMinio("infiniflow")
31
-
32
- PDF = PdfChunker(PdfParser())
33
- DOC = DocxChunker(DocxParser())
34
- EXC = ExcelChunker(ExcelParser())
35
- PPT = PptChunker()
36
-
37
- def chuck_doc(name, binary):
38
- suff = os.path.split(name)[-1].lower().split(".")[-1]
39
- if suff.find("pdf") >= 0: return PDF(binary)
40
- if suff.find("doc") >= 0: return DOC(binary)
41
- if re.match(r"(xlsx|xlsm|xltx|xltm)", suff): return EXC(binary)
42
- if suff.find("ppt") >= 0: return PPT(binary)
43
- if os.envirement.get("PARSE_IMAGE") \
44
- and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
45
- name.lower()):
46
- from llm import CvModel
47
- txt = CvModel.describe(binary)
48
- field = TextChunker.Fields()
49
- field.text_chunks = [(txt, binary)]
50
- field.table_chunks = []
51
-
52
-
53
- return TextChunker()(binary)
54
-
55
-
56
- def collect(comm, mod, tm):
57
- sql = f"""
58
- select
59
- id as kb2doc_id,
60
- kb_id,
61
- did,
62
- updated_at,
63
- is_deleted
64
- from kb2_doc
65
- where
66
- updated_at >= '{tm}'
67
- and kb_progress = 0
68
- and MOD(did, {comm}) = {mod}
69
- order by updated_at asc
70
- limit 1000
71
- """
72
- kb2doc = PG.select(sql)
73
- if len(kb2doc) == 0:return pd.DataFrame()
74
-
75
- sql = """
76
- select
77
- did,
78
- uid,
79
- doc_name,
80
- location,
81
- size
82
- from doc_info
83
- where
84
- did in (%s)
85
- """%",".join([str(i) for i in kb2doc["did"].unique()])
86
- docs = PG.select(sql)
87
- docs = docs.fillna("")
88
- docs = docs.join(kb2doc.set_index("did"), on="did", how="left")
89
-
90
- mtm = str(docs["updated_at"].max())[:19]
91
- print("TOTAL:", len(docs), "To: ", mtm)
92
- return docs
93
-
94
-
95
- def set_progress(kb2doc_id, prog, msg="Processing..."):
96
- sql = f"""
97
- update kb2_doc set kb_progress={prog}, kb_progress_msg='{msg}'
98
- where
99
- id={kb2doc_id}
100
- """
101
- PG.update(sql)
102
-
103
-
104
- def build(row):
105
- if row["size"] > 256000000:
106
- set_progress(row["kb2doc_id"], -1, "File size exceeds( <= 256Mb )")
107
- return []
108
- res = ES.search(Q("term", doc_id=row["did"]))
109
- if ES.getTotal(res) > 0:
110
- ES.updateScriptByQuery(Q("term", doc_id=row["did"]),
111
- scripts="""
112
- if(!ctx._source.kb_id.contains('%s'))
113
- ctx._source.kb_id.add('%s');
114
- """%(str(row["kb_id"]), str(row["kb_id"])),
115
- idxnm = search.index_name(row["uid"])
116
- )
117
- set_progress(row["kb2doc_id"], 1, "Done")
118
- return []
119
-
120
- random.seed(time.time())
121
- set_progress(row["kb2doc_id"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
122
- try:
123
- obj = chuck_doc(row["doc_name"], MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
124
- except Exception as e:
125
- if re.search("(No such file|not found)", str(e)):
126
- set_progress(row["kb2doc_id"], -1, "Can not find file <%s>"%row["doc_name"])
127
- else:
128
- set_progress(row["kb2doc_id"], -1, f"Internal system error: %s"%str(e).replace("'", ""))
129
- return []
130
-
131
- if not obj.text_chunks and not obj.table_chunks:
132
- set_progress(row["kb2doc_id"], 1, "Nothing added! Mostly, file type unsupported yet.")
133
- return []
134
-
135
- set_progress(row["kb2doc_id"], random.randint(20, 60)/100., "Finished slicing files. Start to embedding the content.")
136
-
137
- doc = {
138
- "doc_id": row["did"],
139
- "kb_id": [str(row["kb_id"])],
140
- "docnm_kwd": os.path.split(row["location"])[-1],
141
- "title_tks": huqie.qie(os.path.split(row["location"])[-1]),
142
- "updated_at": str(row["updated_at"]).replace("T", " ")[:19]
143
- }
144
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
145
- output_buffer = BytesIO()
146
- docs = []
147
- md5 = hashlib.md5()
148
- for txt, img in obj.text_chunks:
149
- d = copy.deepcopy(doc)
150
- md5.update((txt + str(d["doc_id"])).encode("utf-8"))
151
- d["_id"] = md5.hexdigest()
152
- d["content_ltks"] = huqie.qie(txt)
153
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
154
- if not img:
155
- docs.append(d)
156
- continue
157
-
158
- if isinstance(img, Image): img.save(output_buffer, format='JPEG')
159
- else: output_buffer = BytesIO(img)
160
-
161
- MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
162
- output_buffer.getvalue())
163
- d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
164
- docs.append(d)
165
-
166
- for arr, img in obj.table_chunks:
167
- for i, txt in enumerate(arr):
168
- d = copy.deepcopy(doc)
169
- d["content_ltks"] = huqie.qie(txt)
170
- md5.update((txt + str(d["doc_id"])).encode("utf-8"))
171
- d["_id"] = md5.hexdigest()
172
- if not img:
173
- docs.append(d)
174
- continue
175
- img.save(output_buffer, format='JPEG')
176
- MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
177
- output_buffer.getvalue())
178
- d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
179
- docs.append(d)
180
- set_progress(row["kb2doc_id"], random.randint(60, 70)/100., "Continue embedding the content.")
181
-
182
- return docs
183
-
184
-
185
- def init_kb(row):
186
- idxnm = search.index_name(row["uid"])
187
- if ES.indexExist(idxnm): return
188
- return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
189
-
190
-
191
- model = None
192
- def embedding(docs):
193
- global model
194
- tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
195
- cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
196
- vects = 0.1 * tts + 0.9 * cnts
197
- assert len(vects) == len(docs)
198
- for i,d in enumerate(docs):d["q_vec"] = vects[i].tolist()
199
-
200
-
201
- def rm_doc_from_kb(df):
202
- if len(df) == 0:return
203
- for _,r in df.iterrows():
204
- ES.updateScriptByQuery(Q("term", doc_id=r["did"]),
205
- scripts="""
206
- if(ctx._source.kb_id.contains('%s'))
207
- ctx._source.kb_id.remove(
208
- ctx._source.kb_id.indexOf('%s')
209
- );
210
- """%(str(r["kb_id"]),str(r["kb_id"])),
211
- idxnm = search.index_name(r["uid"])
212
- )
213
- if len(df) == 0:return
214
- sql = """
215
- delete from kb2_doc where id in (%s)
216
- """%",".join([str(i) for i in df["kb2doc_id"]])
217
- PG.update(sql)
218
-
219
-
220
- def main(comm, mod):
221
- global model
222
- from llm import HuEmbedding
223
- model = HuEmbedding()
224
- tm_fnm = f"res/{comm}-{mod}.tm"
225
- tm = findMaxDt(tm_fnm)
226
- rows = collect(comm, mod, tm)
227
- if len(rows) == 0:return
228
-
229
- rm_doc_from_kb(rows.loc[rows.is_deleted == True])
230
- rows = rows.loc[rows.is_deleted == False].reset_index(drop=True)
231
- if len(rows) == 0:return
232
- tmf = open(tm_fnm, "a+")
233
- for _, r in rows.iterrows():
234
- cks = build(r)
235
- if not cks:
236
- tmf.write(str(r["updated_at"]) + "\n")
237
- continue
238
- ## TODO: exception handler
239
- ## set_progress(r["did"], -1, "ERROR: ")
240
- embedding(cks)
241
-
242
- set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
243
- "Finished embedding! Start to build index!")
244
- init_kb(r)
245
- es_r = ES.bulk(cks, search.index_name(r["uid"]))
246
- if es_r:
247
- set_progress(r["kb2doc_id"], -1, "Index failure!")
248
- print(es_r)
249
- else: set_progress(r["kb2doc_id"], 1., "Done!")
250
- tmf.write(str(r["updated_at"]) + "\n")
251
- tmf.close()
252
-
253
-
254
- if __name__ == "__main__":
255
- from mpi4py import MPI
256
- comm = MPI.COMM_WORLD
257
- main(comm.Get_size(), comm.Get_rank())
258
-
python/tmp.log ADDED
@@ -0,0 +1,15 @@
1
+
2
+ ----------- Model Configuration -----------
3
+ Model Arch: GFL
4
+ Transform Order:
5
+ --transform op: Resize
6
+ --transform op: NormalizeImage
7
+ --transform op: Permute
8
+ --transform op: PadStride
9
+ --------------------------------------------
10
+ Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
11
+ The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
12
+ Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
13
+ - This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
14
+ - This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
15
+ WARNING:root:The files are stored in /opt/home/kevinhu/docgpt/, please check it!
python/util/config.py DELETED
@@ -1,31 +0,0 @@
1
- from configparser import ConfigParser
2
- import os
3
- import inspect
4
-
5
- CF = ConfigParser()
6
- __fnm = os.path.join(os.path.dirname(__file__), '../conf/sys.cnf')
7
- if not os.path.exists(__fnm):
8
- __fnm = os.path.join(os.path.dirname(__file__), '../../conf/sys.cnf')
9
- assert os.path.exists(
10
- __fnm), f"【EXCEPTION】can't find {__fnm}." + os.path.dirname(__file__)
11
- if not os.path.exists(__fnm):
12
- __fnm = "./sys.cnf"
13
-
14
- CF.read(__fnm)
15
-
16
-
17
- class Config:
18
- def __init__(self, env):
19
- self.env = env
20
- if env == "spark":
21
- CF.read("./cv.cnf")
22
-
23
- def get(self, key, default=None):
24
- global CF
25
- return os.environ.get(key.upper(),
26
- CF[self.env].get(key, default)
27
- )
28
-
29
-
30
- def init(env):
31
- return Config(env)
python/util/db_conn.py DELETED
@@ -1,70 +0,0 @@
1
- import logging
2
- import time
3
- from util import config
4
- import pandas as pd
5
-
6
-
7
- class Postgres(object):
8
- def __init__(self, env, dbnm):
9
- self.config = config.init(env)
10
- self.conn = None
11
- self.dbnm = dbnm
12
- self.__open__()
13
-
14
- def __open__(self):
15
- import psycopg2
16
- try:
17
- if self.conn:
18
- self.__close__()
19
- del self.conn
20
- except Exception as e:
21
- pass
22
-
23
- try:
24
- self.conn = psycopg2.connect(f"""dbname={self.dbnm}
25
- user={self.config.get('postgres_user')}
26
- password={self.config.get('postgres_password')}
27
- host={self.config.get('postgres_host')}
28
- port={self.config.get('postgres_port')}""")
29
- except Exception as e:
30
- logging.error(
31
- "Fail to connect %s " %
32
- self.config.get("pgdb_host") + str(e))
33
-
34
- def __close__(self):
35
- try:
36
- self.conn.close()
37
- except Exception as e:
38
- logging.error(
39
- "Fail to close %s " %
40
- self.config.get("pgdb_host") + str(e))
41
-
42
- def select(self, sql):
43
- for _ in range(10):
44
- try:
45
- return pd.read_sql(sql, self.conn)
46
- except Exception as e:
47
- logging.error(f"Fail to exec {sql} " + str(e))
48
- self.__open__()
49
- time.sleep(1)
50
-
51
- return pd.DataFrame()
52
-
53
- def update(self, sql):
54
- for _ in range(10):
55
- try:
56
- cur = self.conn.cursor()
57
- cur.execute(sql)
58
- updated_rows = cur.rowcount
59
- self.conn.commit()
60
- cur.close()
61
- return updated_rows
62
- except Exception as e:
63
- logging.error(f"Fail to exec {sql} " + str(e))
64
- self.__open__()
65
- time.sleep(1)
66
- return 0
67
-
68
-
69
- if __name__ == "__main__":
70
- Postgres("infiniflow", "docgpt")
python/util/setup_logging.py DELETED
@@ -1,36 +0,0 @@
1
- import json
2
- import logging.config
3
- import os
4
-
5
-
6
- def log_dir():
7
- fnm = os.path.join(os.path.dirname(__file__), '../log/')
8
- if not os.path.exists(fnm):
9
- fnm = os.path.join(os.path.dirname(__file__), '../../log/')
10
- assert os.path.exists(fnm), f"Can't locate log dir: {fnm}"
11
- return fnm
12
-
13
-
14
- def setup_logging(default_path="conf/logging.json",
15
- default_level=logging.INFO,
16
- env_key="LOG_CFG"):
17
- path = default_path
18
- value = os.getenv(env_key, None)
19
- if value:
20
- path = value
21
- if os.path.exists(path):
22
- with open(path, "r") as f:
23
- config = json.load(f)
24
- fnm = log_dir()
25
-
26
- config["handlers"]["info_file_handler"]["filename"] = fnm + "info.log"
27
- config["handlers"]["error_file_handler"]["filename"] = fnm + "error.log"
28
- logging.config.dictConfig(config)
29
- else:
30
- logging.basicConfig(level=default_level)
31
-
32
-
33
- __fnm = os.path.join(os.path.dirname(__file__), 'conf/logging.json')
34
- if not os.path.exists(__fnm):
35
- __fnm = os.path.join(os.path.dirname(__file__), '../../conf/logging.json')
36
- setup_logging(__fnm)
rag/__init__.py ADDED
File without changes
rag/llm/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from .embedding_model import *
17
+ from .chat_model import *
18
+ from .cv_model import *
19
+
20
+
21
+ EmbeddingModel = {
22
+ "local": HuEmbedding,
23
+ "OpenAI": OpenAIEmbed,
24
+ "通义千问": QWenEmbed,
25
+ }
26
+
27
+
28
+ CvModel = {
29
+ "OpenAI": GptV4,
30
+ "通义千问": QWenCV,
31
+ }
32
+
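The new `rag/llm/__init__.py` turns model selection into a dictionary lookup keyed by provider name, so callers resolve a class from configuration instead of importing it directly. A hedged usage sketch (the API key is a placeholder; the `(key, model_name=...)` constructor shape follows the classes as changed in this commit):

    from rag.llm import EmbeddingModel, CvModel

    provider = "OpenAI"                        # EmbeddingModel also has "local" and "通义千问" entries
    embd = EmbeddingModel[provider]("sk-...")  # placeholder key
    cv = CvModel[provider]("sk-...")

    vectors, used_tokens = embd.encode(["hello world"])
    # description = cv.describe(image_bytes)   # image_bytes is hypothetical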
{python → rag}/llm/chat_model.py RENAMED
@@ -1,3 +1,18 @@
1
  from abc import ABC
2
  from openai import OpenAI
3
  import os
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
  from openai import OpenAI
18
  import os
{python → rag}/llm/cv_model.py RENAMED
@@ -1,3 +1,18 @@
1
  from abc import ABC
2
  from openai import OpenAI
3
  import os
@@ -6,6 +21,9 @@ from io import BytesIO
6
 
7
 
8
  class Base(ABC):
 
 
 
9
  def describe(self, image, max_tokens=300):
10
  raise NotImplementedError("Please implement encode method!")
11
 
@@ -40,14 +58,15 @@ class Base(ABC):
40
 
41
 
42
  class GptV4(Base):
43
- def __init__(self):
44
- self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
45
 
46
  def describe(self, image, max_tokens=300):
47
  b64 = self.image2base64(image)
48
 
49
  res = self.client.chat.completions.create(
50
- model="gpt-4-vision-preview",
51
  messages=self.prompt(b64),
52
  max_tokens=max_tokens,
53
  )
@@ -55,11 +74,15 @@ class GptV4(Base):
55
 
56
 
57
  class QWenCV(Base):
 
 
 
 
 
58
  def describe(self, image, max_tokens=300):
59
  from http import HTTPStatus
60
  from dashscope import MultiModalConversation
61
- # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
62
- response = MultiModalConversation.call(model=MultiModalConversation.Models.qwen_vl_chat_v1,
63
  messages=self.prompt(self.image2base64(image)))
64
  if response.status_code == HTTPStatus.OK:
65
  return response.output.choices[0]['message']['content']
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
  from openai import OpenAI
18
  import os
 
21
 
22
 
23
  class Base(ABC):
24
+ def __init__(self, key, model_name):
25
+ pass
26
+
27
  def describe(self, image, max_tokens=300):
28
  raise NotImplementedError("Please implement encode method!")
29
 
 
58
 
59
 
60
  class GptV4(Base):
61
+ def __init__(self, key, model_name="gpt-4-vision-preview"):
62
+ self.client = OpenAI(key)
63
+ self.model_name = model_name
64
 
65
  def describe(self, image, max_tokens=300):
66
  b64 = self.image2base64(image)
67
 
68
  res = self.client.chat.completions.create(
69
+ model=self.model_name,
70
  messages=self.prompt(b64),
71
  max_tokens=max_tokens,
72
  )
 
74
 
75
 
76
  class QWenCV(Base):
77
+ def __init__(self, key, model_name="qwen-vl-chat-v1"):
78
+ import dashscope
79
+ dashscope.api_key = key
80
+ self.model_name = model_name
81
+
82
  def describe(self, image, max_tokens=300):
83
  from http import HTTPStatus
84
  from dashscope import MultiModalConversation
85
+ response = MultiModalConversation.call(model=self.model_name,
 
86
  messages=self.prompt(self.image2base64(image)))
87
  if response.status_code == HTTPStatus.OK:
88
  return response.output.choices[0]['message']['content']
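Both vision wrappers now take an explicit key and model name at construction time instead of reading `OPENAI_API_KEY`/`DASHSCOPE_API_KEY` from the environment, while `describe(image, max_tokens)` keeps its signature. A small sketch under those assumptions (keys and the image file are placeholders; whether `image2base64` expects raw bytes or a PIL image is not visible in this hunk):

    from rag.llm.cv_model import GptV4, QWenCV

    with open("thumbnail.png", "rb") as f:   # hypothetical image file
        image = f.read()

    gpt = GptV4("sk-...")                    # defaults to gpt-4-vision-preview
    print(gpt.describe(image, max_tokens=300))

    qwen = QWenCV("dashscope-key")           # defaults to qwen-vl-chat-v1
    print(qwen.describe(image))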
{python → rag}/llm/embedding_model.py RENAMED
@@ -1,12 +1,35 @@
1
  from abc import ABC
 
 
2
  from openai import OpenAI
3
  from FlagEmbedding import FlagModel
4
  import torch
5
  import os
6
  import numpy as np
7
 
 
 
8
 
9
  class Base(ABC):
 
 
 
 
10
  def encode(self, texts: list, batch_size=32):
11
  raise NotImplementedError("Please implement encode method!")
12
 
@@ -28,34 +51,44 @@ class HuEmbedding(Base):
28
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
29
  use_fp16=torch.cuda.is_available())
30
 
 
31
  def encode(self, texts: list, batch_size=32):
 
 
32
  res = []
33
  for i in range(0, len(texts), batch_size):
34
  res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
35
- return np.array(res)
36
 
37
 
38
- class GptEmbed(Base):
39
- def __init__(self):
40
- self.client = OpenAI(api_key=os.envirement["OPENAI_API_KEY"])
 
41
 
42
  def encode(self, texts: list, batch_size=32):
 
 
43
  res = self.client.embeddings.create(input=texts,
44
- model="text-embedding-ada-002")
45
- return [d["embedding"] for d in res["data"]]
 
46
 
 
 
 
 
47
 
48
- class QWenEmbd(Base):
49
  def encode(self, texts: list, batch_size=32, text_type="document"):
50
- # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
51
  import dashscope
52
- from http import HTTPStatus
53
  res = []
 
54
  for txt in texts:
55
  resp = dashscope.TextEmbedding.call(
56
- model=dashscope.TextEmbedding.Models.text_embedding_v2,
57
  input=txt[:2048],
58
  text_type=text_type
59
  )
60
  res.append(resp["output"]["embeddings"][0]["embedding"])
61
- return res
 
 
1
+ #
2
+ # Copyright 2019 The FATE Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
  from abc import ABC
17
+
18
+ import dashscope
19
  from openai import OpenAI
20
  from FlagEmbedding import FlagModel
21
  import torch
22
  import os
23
  import numpy as np
24
 
25
+ from rag.utils import num_tokens_from_string
26
+
27
 
28
  class Base(ABC):
29
+ def __init__(self, key, model_name):
30
+ pass
31
+
32
+
33
  def encode(self, texts: list, batch_size=32):
34
  raise NotImplementedError("Please implement encode method!")
35
 
 
51
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
52
  use_fp16=torch.cuda.is_available())
53
 
54
+
55
  def encode(self, texts: list, batch_size=32):
56
+ token_count = 0
57
+ for t in texts: token_count += num_tokens_from_string(t)
58
  res = []
59
  for i in range(0, len(texts), batch_size):
60
  res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
61
+ return np.array(res), token_count
62
 
63
 
64
+ class OpenAIEmbed(Base):
65
+ def __init__(self, key, model_name="text-embedding-ada-002"):
66
+ self.client = OpenAI(key)
67
+ self.model_name = model_name
68
 
69
  def encode(self, texts: list, batch_size=32):
70
+ token_count = 0
71
+ for t in texts: token_count += num_tokens_from_string(t)
72
  res = self.client.embeddings.create(input=texts,
73
+ model=self.model_name)
74
+ return [d["embedding"] for d in res["data"]], token_count
75
+
76
 
77
+ class QWenEmbed(Base):
78
+ def __init__(self, key, model_name="text_embedding_v2"):
79
+ dashscope.api_key = key
80
+ self.model_name = model_name
81
 
 
82
  def encode(self, texts: list, batch_size=32, text_type="document"):
 
83
  import dashscope
 
84
  res = []
85
+ token_count = 0
86
  for txt in texts:
87
  resp = dashscope.TextEmbedding.call(
88
+ model=self.model_name,
89
  input=txt[:2048],
90
  text_type=text_type
91
  )
92
  res.append(resp["output"]["embeddings"][0]["embedding"])
93
+ token_count += resp["usage"]["total_tokens"]
94
+ return res, token_count
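The contract change to watch in `embedding_model.py` is the return value: every `encode()` now yields a `(vectors, token_count)` pair so callers can meter usage per request, counted locally via `num_tokens_from_string` or taken from the provider's `usage` field for QWen. A minimal sketch of the local path (the constructor arguments of `HuEmbedding` are not visible in this hunk; the no-argument call mirrors the deleted `parse_user_docs.py` and may need adjusting):

    from rag.llm.embedding_model import HuEmbedding

    model = HuEmbedding()  # loads whatever FlagEmbedding model the class is configured with
    vectors, token_count = model.encode(["什么是RAG?", "retrieval-augmented generation"])
    print(vectors.shape, token_count)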
rag/nlp/__init__.py ADDED
File without changes
{python → rag}/nlp/huchunk.py RENAMED
File without changes
{python → rag}/nlp/huqie.py RENAMED
@@ -9,6 +9,8 @@ import string
9
  import sys
10
  from hanziconv import HanziConv
11
 
 
 
12
 
13
  class Huqie:
14
  def key_(self, line):
@@ -41,14 +43,7 @@ class Huqie:
41
  self.DEBUG = debug
42
  self.DENOMINATOR = 1000000
43
  self.trie_ = datrie.Trie(string.printable)
44
- self.DIR_ = ""
45
- if os.path.exists("../res/huqie.txt"):
46
- self.DIR_ = "../res/huqie"
47
- if os.path.exists("./res/huqie.txt"):
48
- self.DIR_ = "./res/huqie"
49
- if os.path.exists("./huqie.txt"):
50
- self.DIR_ = "./huqie"
51
- assert self.DIR_, f"【Can't find huqie】"
52
 
53
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
54
  try:
 
9
  import sys
10
  from hanziconv import HanziConv
11
 
12
+ from web_server.utils.file_utils import get_project_base_directory
13
+
14
 
15
  class Huqie:
16
  def key_(self, line):
 
43
  self.DEBUG = debug
44
  self.DENOMINATOR = 1000000
45
  self.trie_ = datrie.Trie(string.printable)
46
+ self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
 
 
 
 
 
 
 
47
 
48
  self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
49
  try:
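Several modules in this change drop their try-multiple-relative-paths probing in favour of resolving resources from the project root. A sketch of that pattern, assuming `get_project_base_directory()` returns the repository root as the new imports imply:

    import os
    from web_server.utils.file_utils import get_project_base_directory

    huqie_prefix = os.path.join(get_project_base_directory(), "rag/res", "huqie")
    synonym_path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
    print(huqie_prefix, synonym_path)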
{python → rag}/nlp/query.py RENAMED
@@ -1,12 +1,12 @@
 
 
1
  import json
2
  import re
3
- import sys
4
- import os
5
  import logging
6
  import copy
7
  import math
8
  from elasticsearch_dsl import Q, Search
9
- from nlp import huqie, term_weight, synonym
10
 
11
 
12
  class EsQueryer:
 
1
+ # -*- coding: utf-8 -*-
2
+
3
  import json
4
  import re
 
 
5
  import logging
6
  import copy
7
  import math
8
  from elasticsearch_dsl import Q, Search
9
+ from rag.nlp import huqie, term_weight, synonym
10
 
11
 
12
  class EsQueryer:
{python → rag}/nlp/search.py RENAMED
@@ -1,13 +1,11 @@
 
1
  import re
2
  from elasticsearch_dsl import Q, Search, A
3
  from typing import List, Optional, Tuple, Dict, Union
4
  from dataclasses import dataclass
5
- from util import setup_logging, rmSpace
6
- from nlp import huqie, query
7
- from datetime import datetime
8
- from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
9
  import numpy as np
10
- from copy import deepcopy
11
 
12
 
13
  def index_name(uid): return f"docgpt_{uid}"
 
1
+ # -*- coding: utf-8 -*-
2
  import re
3
  from elasticsearch_dsl import Q, Search, A
4
  from typing import List, Optional, Tuple, Dict, Union
5
  from dataclasses import dataclass
6
+ from rag.utils import rmSpace
7
+ from rag.nlp import huqie, query
 
 
8
  import numpy as np
 
9
 
10
 
11
  def index_name(uid): return f"docgpt_{uid}"
{python → rag}/nlp/synonym.py RENAMED
@@ -1,8 +1,11 @@
1
  import json
 
2
  import time
3
  import logging
4
  import re
5
 
 
 
6
 
7
  class Dealer:
8
  def __init__(self, redis=None):
@@ -10,18 +13,12 @@ class Dealer:
10
  self.lookup_num = 100000000
11
  self.load_tm = time.time() - 1000000
12
  self.dictionary = None
 
13
  try:
14
- self.dictionary = json.load(open("./synonym.json", 'r'))
15
- except Exception as e:
16
- pass
17
- try:
18
- self.dictionary = json.load(open("./res/synonym.json", 'r'))
19
  except Exception as e:
20
- try:
21
- self.dictionary = json.load(open("../res/synonym.json", 'r'))
22
- except Exception as e:
23
- logging.warn("Miss synonym.json")
24
- self.dictionary = {}
25
 
26
  if not redis:
27
  logging.warning(
 
1
  import json
2
+ import os
3
  import time
4
  import logging
5
  import re
6
 
7
+ from web_server.utils.file_utils import get_project_base_directory
8
+
9
 
10
  class Dealer:
11
  def __init__(self, redis=None):
 
13
  self.lookup_num = 100000000
14
  self.load_tm = time.time() - 1000000
15
  self.dictionary = None
16
+ path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
17
  try:
18
+ self.dictionary = json.load(open(path, 'r'))
 
 
 
 
19
  except Exception as e:
20
+ logging.warn("Miss synonym.json")
21
+ self.dictionary = {}
 
 
 
22
 
23
  if not redis:
24
  logging.warning(
{python → rag}/nlp/term_weight.py RENAMED
@@ -1,9 +1,11 @@
 
1
  import math
2
  import json
3
  import re
4
  import os
5
  import numpy as np
6
- from nlp import huqie
 
7
 
8
 
9
  class Dealer:
@@ -60,16 +62,14 @@ class Dealer:
60
  return set(res.keys())
61
  return res
62
 
63
- fnm = os.path.join(os.path.dirname(__file__), '../res/')
64
- if not os.path.exists(fnm):
65
- fnm = os.path.join(os.path.dirname(__file__), '../../res/')
66
  self.ne, self.df = {}, {}
67
  try:
68
- self.ne = json.load(open(fnm + "ner.json", "r"))
69
  except Exception as e:
70
  print("[WARNING] Load ner.json FAIL!")
71
  try:
72
- self.df = load_dict(fnm + "term.freq")
73
  except Exception as e:
74
  print("[WARNING] Load term.freq FAIL!")
75
 
 
1
+ # -*- coding: utf-8 -*-
2
  import math
3
  import json
4
  import re
5
  import os
6
  import numpy as np
7
+ from rag.nlp import huqie
8
+ from web_server.utils.file_utils import get_project_base_directory
9
 
10
 
11
  class Dealer:
 
62
  return set(res.keys())
63
  return res
64
 
65
+ fnm = os.path.join(get_project_base_directory(), "res")
 
 
66
  self.ne, self.df = {}, {}
67
  try:
68
+ self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
69
  except Exception as e:
70
  print("[WARNING] Load ner.json FAIL!")
71
  try:
72
+ self.df = load_dict(os.path.join(fnm, "term.freq"))
73
  except Exception as e:
74
  print("[WARNING] Load term.freq FAIL!")
75
 
{python → rag}/parser/__init__.py RENAMED
File without changes
{python → rag}/parser/docx_parser.py RENAMED
@@ -1,8 +1,9 @@
 
1
  from docx import Document
2
  import re
3
  import pandas as pd
4
  from collections import Counter
5
- from nlp import huqie
6
  from io import BytesIO
7
 
8
 
 
1
+ # -*- coding: utf-8 -*-
2
  from docx import Document
3
  import re
4
  import pandas as pd
5
  from collections import Counter
6
+ from rag.nlp import huqie
7
  from io import BytesIO
8
 
9
 
{python → rag}/parser/excel_parser.py RENAMED
@@ -1,3 +1,4 @@
 
1
  from openpyxl import load_workbook
2
  import sys
3
  from io import BytesIO
@@ -12,11 +13,18 @@ class HuExcelParser:
12
  res = []
13
  for sheetname in wb.sheetnames:
14
  ws = wb[sheetname]
15
- lines = []
16
- for r in ws.rows:
17
- lines.append(
18
- "\t".join([str(c.value) if c.value is not None else "" for c in r]))
19
- res.append(f"《{sheetname}》\n" + "\n".join(lines))
 
 
 
 
 
 
 
20
  return res
21
 
22
 
 
1
+ # -*- coding: utf-8 -*-
2
  from openpyxl import load_workbook
3
  import sys
4
  from io import BytesIO
 
13
  res = []
14
  for sheetname in wb.sheetnames:
15
  ws = wb[sheetname]
16
+ rows = list(ws.rows)
17
+ ti = list(rows[0])
18
+ for r in list(rows[1:]):
19
+ l = []
20
+ for i,c in enumerate(r):
21
+ if not c.value:continue
22
+ t = str(ti[i].value) if i < len(ti) else ""
23
+ t += (":" if t else "") + str(c.value)
24
+ l.append(t)
25
+ l = "; ".join(l)
26
+ if sheetname.lower().find("sheet") <0: l += " ——"+sheetname
27
+ res.append(l)
28
  return res
29
 
30
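The rewritten excel parser stops emitting whole sheets as tab-separated blocks; the first row is treated as a header and every later row becomes a single `header:value; header:value` line, suffixed with the sheet name whenever it is not a default `Sheet…` name, so each record is self-describing for the chunker. A standalone illustration of that flattening on a synthetic workbook (it reproduces the loop from the diff rather than calling the parser class, whose method name is not shown in this hunk):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.title = "Employees"
    ws.append(["Name", "Dept"])
    ws.append(["Alice", "R&D"])

    rows = list(ws.rows)
    ti = list(rows[0])                      # header row
    res = []
    for r in rows[1:]:
        fields = []
        for i, c in enumerate(r):
            if not c.value:
                continue
            t = str(ti[i].value) if i < len(ti) else ""
            t += (":" if t else "") + str(c.value)
            fields.append(t)
        line = "; ".join(fields)
        if ws.title.lower().find("sheet") < 0:
            line += " ——" + ws.title
        res.append(line)
    print(res)                              # ['Name:Alice; Dept:R&D ——Employees']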
 
{python → rag}/parser/pdf_parser.py RENAMED
@@ -1,3 +1,4 @@
 
1
  import xgboost as xgb
2
  from io import BytesIO
3
  import torch
@@ -6,11 +7,11 @@ import pdfplumber
6
  import logging
7
  from PIL import Image
8
  import numpy as np
9
- from nlp import huqie
10
  from collections import Counter
11
  from copy import deepcopy
12
- from cv.table_recognize import TableTransformer
13
- from cv.ppdetection import PPDet
14
  from huggingface_hub import hf_hub_download
15
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
16
 
 
1
+ # -*- coding: utf-8 -*-
2
  import xgboost as xgb
3
  from io import BytesIO
4
  import torch
 
7
  import logging
8
  from PIL import Image
9
  import numpy as np
10
+ from rag.nlp import huqie
11
  from collections import Counter
12
  from copy import deepcopy
13
+ from rag.cv.table_recognize import TableTransformer
14
+ from rag.cv.ppdetection import PPDet
15
  from huggingface_hub import hf_hub_download
16
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
17
 
{python → rag}/res/huqie.txt RENAMED
File without changes
{python → rag}/res/ner.json RENAMED
File without changes